diff options
Diffstat (limited to 'fs')
720 files changed, 18205 insertions, 14899 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 1286d96a29bc..862164181bac 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -59,7 +59,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); if (len > 0) __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - netfs_write_subrequest_terminated(subreq, len ?: err, false); + netfs_write_subrequest_terminated(subreq, len ?: err); } /** @@ -77,7 +77,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - if (subreq->rreq->origin != NETFS_DIO_READ) + if (subreq->rreq->origin != NETFS_UNBUFFERED_READ && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 5061f192eafd..04795508a795 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -127,7 +127,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, .d_unalias_trylock = v9fs_dentry_unalias_trylock, .d_unalias_unlock = v9fs_dentry_unalias_unlock, diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 348cc90bf9c5..eb0b083da269 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -454,9 +454,10 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, } static int -v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) +v9fs_file_mmap_prepare(struct vm_area_desc *desc) { int retval; + struct file *filp = desc->file; struct inode *inode = file_inode(filp); struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); @@ -464,12 +465,12 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) if (!(v9ses->cache & CACHE_WRITEBACK)) { p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)"); - return generic_file_readonly_mmap(filp, vma); + return generic_file_readonly_mmap_prepare(desc); } - retval = generic_file_mmap(filp, vma); + retval = generic_file_mmap_prepare(desc); if (!retval) - vma->vm_ops = &v9fs_mmap_file_vm_ops; + desc->vm_ops = &v9fs_mmap_file_vm_ops; return retval; } @@ -516,7 +517,7 @@ const struct file_operations v9fs_file_operations = { .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, - .mmap = generic_file_readonly_mmap, + .mmap_prepare = generic_file_readonly_mmap_prepare, .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, @@ -531,7 +532,7 @@ const struct file_operations v9fs_file_operations_dotl = { .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, - .mmap = v9fs_file_mmap, + .mmap_prepare = v9fs_file_mmap_prepare, .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 489db161abc9..795c6388744c 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -134,10 +134,12 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, if (retval) goto release_sb; - if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - sb->s_d_op = &v9fs_cached_dentry_operations; - else - sb->s_d_op = &v9fs_dentry_operations; + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { + set_default_d_op(sb, &v9fs_cached_dentry_operations); + } else { + set_default_d_op(sb, &v9fs_dentry_operations); + sb->s_d_flags |= DCACHE_DONTCACHE; + } inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb); if (IS_ERR(inode)) { diff --git a/fs/Kconfig b/fs/Kconfig index 44b6cdd36dc1..c654a3642897 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -59,7 +59,7 @@ endif # BLOCK config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU - depends on ZONE_DEVICE || FS_DAX_LIMITED + depends on ZONE_DEVICE select FS_IOMAP select DAX help @@ -95,13 +95,6 @@ config FS_DAX_PMD depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE -# Selected by DAX drivers that do not expect filesystem DAX to support -# get_user_pages() of DAX mappings. I.e. "limited" indicates no support -# for fork() of processes with MAP_SHARED mappings or support for -# direct-I/O to a DAX mapping. -config FS_DAX_LIMITED - bool - # Posix ACL utility routines # # Note: Posix ACLs can be implemented without these helpers. Never use @@ -256,7 +249,7 @@ config ARCH_SUPPORTS_HUGETLBFS menuconfig HUGETLBFS bool "HugeTLB file system support" - depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on ARCH_SUPPORTS_HUGETLBFS depends on (SYSFS || SYSCTL) select MEMFD_CREATE select PADATA if SMP diff --git a/fs/Makefile b/fs/Makefile index 79c08b914c47..334654f9584b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -15,7 +15,8 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o + kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ + file_attr.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/adfs/file.c b/fs/adfs/file.c index ee80718aaeec..cd13165fd904 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -25,7 +25,7 @@ const struct file_operations adfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .fsync = generic_file_fsync, .write_iter = generic_file_write_iter, .splice_read = filemap_splice_read, diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 21527189e430..6830f8bc8d4e 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, inode->i_size); } -static int adfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int adfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 017c48a80203..fdccdbbfc213 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -397,7 +397,7 @@ static int adfs_fill_super(struct super_block *sb, struct fs_context *fc) if (asb->s_ftsuffix) asb->s_namelen += 4; - sb->s_d_op = &adfs_dentry_operations; + set_default_d_op(sb, &adfs_dentry_operations); root = adfs_iget(sb, &root_obj); sb->s_root = d_make_root(root); if (!sb->s_root) { diff --git a/fs/affs/file.c b/fs/affs/file.c index 7a71018e3f67..765c3443663e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -static int affs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int affs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int affs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, +static int affs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); /* Clear Archived bit on file writes, as AmigaOS would do */ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { @@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) return err; } -static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, +static int affs_write_begin_ofs(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return err; } -static int affs_write_end_ofs(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int affs_write_end_ofs(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; @@ -999,7 +1003,7 @@ const struct file_operations affs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .open = affs_file_open, .release = affs_file_release, .fsync = affs_file_fsync, diff --git a/fs/affs/super.c b/fs/affs/super.c index 2fa40337776d..44f8aa883100 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -500,9 +500,9 @@ got_root: return PTR_ERR(root_inode); if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL)) - sb->s_d_op = &affs_intl_dentry_operations; + set_default_d_op(sb, &affs_intl_dentry_operations); else - sb->s_d_op = &affs_dentry_operations; + set_default_d_op(sb, &affs_dentry_operations); sb->s_root = d_make_root(root_inode); if (!sb->s_root) { diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c index c0384201b8fe..133736412c3d 100644 --- a/fs/afs/addr_prefs.c +++ b/fs/afs/addr_prefs.c @@ -48,7 +48,7 @@ static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv) strv[count++] = p; /* Skip over word */ - while (!isspace(*p)) + while (!isspace(*p) && *p) p++; if (!*p) break; diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 0168bbf53fe0..f31359922e98 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -177,6 +177,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, VL_SERVICE, AFS_VL_PORT); if (IS_ERR(vllist)) { ret = PTR_ERR(vllist); + vllist = NULL; goto parse_failed; } diff --git a/fs/afs/file.c b/fs/afs/file.c index fc15497608c6..f66a92294284 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,7 +19,7 @@ #include <trace/events/netfs.h> #include "internal.h" -static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); +static int afs_file_mmap_prepare(struct vm_area_desc *desc); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, @@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = { .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, .write_iter = netfs_file_write_iter, - .mmap = afs_file_mmap, + .mmap_prepare = afs_file_mmap_prepare, .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = afs_fsync, @@ -492,16 +492,16 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode) /* * Handle setting up a memory mapping on an AFS file. */ -static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int afs_file_mmap_prepare(struct vm_area_desc *desc) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file)); int ret; afs_add_open_mmap(vnode); - ret = generic_file_mmap(file, vma); + ret = generic_file_mmap_prepare(desc); if (ret == 0) - vma->vm_ops = &afs_vm_ops; + desc->vm_ops = &afs_vm_ops; else afs_drop_open_mmap(vnode); return ret; diff --git a/fs/afs/super.c b/fs/afs/super.c index 25b306db6992..da407f2d6f0d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -483,9 +483,9 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) goto error; if (as->dyn_root) { - sb->s_d_op = &afs_dynroot_dentry_operations; + set_default_d_op(sb, &afs_dynroot_dentry_operations); } else { - sb->s_d_op = &afs_fs_dentry_operations; + set_default_d_op(sb, &afs_fs_dentry_operations); rcu_assign_pointer(as->volume->sb, sb); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 18b0a9f1615e..2e7526ea883a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -120,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work) #if 0 // Error injection if (subreq->debug_index == 3) - return netfs_write_subrequest_terminated(subreq, -ENOANO, false); + return netfs_write_subrequest_terminated(subreq, -ENOANO); if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); } #endif op = afs_alloc_operation(wreq->netfs_priv, vnode->volume); if (IS_ERR(op)) - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; @@ -166,7 +166,7 @@ static void afs_issue_write_worker(struct work_struct *work) break; } - netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len); } void afs_issue_write(struct netfs_io_subrequest *subreq) @@ -202,6 +202,7 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st case NETFS_READ_GAPS: case NETFS_READ_SINGLE: case NETFS_READ_FOR_WRITE: + case NETFS_UNBUFFERED_READ: case NETFS_DIO_READ: return; default: @@ -392,15 +392,15 @@ static const struct vm_operations_struct aio_ring_vm_ops = { #endif }; -static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_mmap_prepare(struct vm_area_desc *desc) { - vm_flags_set(vma, VM_DONTEXPAND); - vma->vm_ops = &aio_ring_vm_ops; + desc->vm_flags |= VM_DONTEXPAND; + desc->vm_ops = &aio_ring_vm_ops; return 0; } static const struct file_operations aio_ring_fops = { - .mmap = aio_ring_mmap, + .mmap_prepare = aio_ring_mmap_prepare, }; #if IS_ENABLED(CONFIG_MIGRATION) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e51e7d88980a..1d847a939f29 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -98,14 +98,25 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; -static struct inode *anon_inode_make_secure_inode( - const char *name, - const struct inode *context_inode) +/** + * anon_inode_make_secure_inode - allocate an anonymous inode with security context + * @sb: [in] Superblock to allocate from + * @name: [in] Name of the class of the newfile (e.g., "secretmem") + * @context_inode: + * [in] Optional parent inode for security inheritance + * + * The function ensures proper security initialization through the LSM hook + * security_inode_init_security_anon(). + * + * Return: Pointer to new inode on success, ERR_PTR on failure. + */ +struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name, + const struct inode *context_inode) { struct inode *inode; int error; - inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + inode = alloc_anon_inode(sb); if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; @@ -118,6 +129,7 @@ static struct inode *anon_inode_make_secure_inode( } return inode; } +EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, @@ -132,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name, return ERR_PTR(-ENOENT); if (make_inode) { - inode = anon_inode_make_secure_inode(name, context_inode); + inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb, + name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); goto err; diff --git a/fs/attr.c b/fs/attr.c index 9caf63d20d03..5425c1dbbff9 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare); * @inode: the inode to be truncated * @offset: the new size to assign to the inode * - * inode_newsize_ok must be called with i_mutex held. + * inode_newsize_ok must be called with i_rwsem held exclusively. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ @@ -318,7 +318,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) * @inode: the inode to be updated * @attr: the new attributes * - * setattr_copy must be called with i_mutex held. + * setattr_copy must be called with i_rwsem held exclusively. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine @@ -403,13 +403,13 @@ EXPORT_SYMBOL(may_setattr); * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * - * The caller must hold the i_mutex on the affected object. + * The caller must hold the i_rwsem exclusively on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -456,7 +456,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - /* Flag setting protected by i_mutex */ + /* Flag setting protected by i_rwsem */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index ee2edccaef70..f5c16ffba013 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -311,7 +311,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc) s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; s->s_op = &autofs_sops; - s->s_d_op = &autofs_dentry_operations; + set_default_d_op(s, &autofs_dentry_operations); s->s_time_gran = 1; /* diff --git a/fs/backing-file.c b/fs/backing-file.c index 763fbe9b72b2..15a7f8031084 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags, return f; path_get(user_path); - *backing_file_user_path(f) = *user_path; + backing_file_set_user_path(f, user_path); error = vfs_open(real_path, f); if (error) { fput(f); @@ -65,7 +65,7 @@ struct file *backing_tmpfile_open(const struct path *user_path, int flags, return f; path_get(user_path); - *backing_file_user_path(f) = *user_path; + backing_file_set_user_path(f, user_path); error = vfs_tmpfile(real_idmap, real_parentpath, f, mode); if (error) { fput(f); @@ -333,13 +333,13 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; - if (!file->f_op->mmap) + if (!can_mmap_file(file)) return -ENODEV; vma_set_file(vma, file); old_cred = override_creds(ctx->cred); - ret = call_mmap(vma->vm_file, vma); + ret = vfs_mmap(vma->vm_file, vma); revert_creds(old_cred); if (ctx->accessed) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 173e81c2bbcb..66de46318620 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -21,7 +21,6 @@ #include "error.h" #include "lru.h" #include "recovery.h" -#include "trace.h" #include "varint.h" #include <linux/kthread.h> @@ -337,11 +336,10 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe_sectors = swab32(a->stripe_sectors); } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, + unsigned dev, const struct bch_alloc_v4 *a) { - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; + struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL; prt_newline(out); printbuf_indent_add(out, 2); @@ -369,6 +367,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_dev_put(ca); } +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + + __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); +} + +void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); +} + void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { @@ -697,8 +708,8 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, set ? "" : "un", bch2_btree_id_str(btree), buf.buf); - if (ret == -BCH_ERR_fsck_ignore || - ret == -BCH_ERR_fsck_errors_not_fixed) + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || + bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) ret = 0; printbuf_exit(&buf); @@ -854,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -BCH_ERR_trigger_alloc; + return bch_err_throw(c, trigger_alloc); struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -988,14 +999,11 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if (new_a->gen != old_a->gen) { - rcu_read_lock(); + guard(rcu)(); u8 *gen = bucket_gen(ca, new.k->p.offset); - if (unlikely(!gen)) { - rcu_read_unlock(); + if (unlikely(!gen)) goto invalid_bucket; - } *gen = new_a->gen; - rcu_read_unlock(); } #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) @@ -1021,15 +1029,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { - rcu_read_lock(); + guard(rcu)(); struct bucket *g = gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) { - rcu_read_unlock(); + if (unlikely(!g)) goto invalid_bucket; - } g->gen_valid = 1; g->gen = new_a->gen; - rcu_read_unlock(); } err: fsck_err: @@ -1039,7 +1044,7 @@ fsck_err: invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -BCH_ERR_trigger_alloc; + ret = bch_err_throw(c, trigger_alloc); goto err; } @@ -1105,13 +1110,12 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck bucket->offset = 0; } - rcu_read_lock(); + guard(rcu)(); *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (*ca) { *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); bch2_dev_get(*ca); } - rcu_read_unlock(); return *ca != NULL; } @@ -1402,6 +1406,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite : BCH_DATA_free; struct printbuf buf = PRINTBUF; + unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)| + FSCK_CAN_FIX|FSCK_CAN_IGNORE; + struct bpos bucket = iter->pos; bucket.offset &= ~(~0ULL << 56); u64 genbits = iter->pos.offset & (~0ULL << 56); @@ -1415,9 +1422,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite return ret; if (!bch2_dev_bucket_exists(c, bucket)) { - if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + if (__fsck_err(trans, fsck_flags, + need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) goto delete; ret = 1; goto out; @@ -1429,7 +1437,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite if (a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a))) { - if (fsck_err(trans, need_discard_freespace_key_bad, + if (__fsck_err(trans, fsck_flags, + need_discard_freespace_key_bad, "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), @@ -1454,7 +1463,7 @@ delete: ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; + bch_err_throw(c, transaction_restart_commit); goto out; } else { /* @@ -1777,14 +1786,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { + struct bch_fs *c = ca->fs; int ret; mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - ret = -BCH_ERR_EEXIST_discard_in_flight_add; - goto out; - } + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + if (i) { + ret = bch_err_throw(c, EEXIST_discard_in_flight_add); + goto out; + } ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { .in_progress = in_progress, @@ -1798,14 +1809,11 @@ out: static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - BUG_ON(!i->in_progress); - darray_remove_item(&ca->discard_buckets_in_flight, i); - goto found; - } - BUG(); -found: + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + BUG_ON(!i || !i->in_progress); + + darray_remove_item(&ca->discard_buckets_in_flight, i); mutex_unlock(&ca->discard_buckets_in_flight_lock); } @@ -2504,7 +2512,7 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); if (bdev) @@ -2549,7 +2557,6 @@ void bch2_recalc_capacity(struct bch_fs *c) bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } - rcu_read_unlock(); bch2_set_ra_pages(c, ra_pages); @@ -2574,10 +2581,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - rcu_read_lock(); + guard(rcu)(); for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); - rcu_read_unlock(); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 4f94c6a661bf..0cc5adc55b6f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,11 +13,9 @@ static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - bool ret = ca && bucket_valid(ca, pos.offset); - rcu_read_unlock(); - return ret; + return ca && bucket_valid(ca, pos.offset); } static inline u64 bucket_to_u64(struct bpos bucket) @@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_validate = bch2_alloc_v1_validate, \ @@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_to_text, \ + .val_to_text = bch2_alloc_v4_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1a52c12c51ae..b58525ec7b4d 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); - rcu_read_unlock(); } static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) @@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = @@ -229,7 +227,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); - return ERR_PTR(-BCH_ERR_open_buckets_empty); + return ERR_PTR(bch_err_throw(c, open_buckets_empty)); } /* Recheck under lock: */ @@ -513,7 +511,8 @@ again: bch2_dev_usage_read_fast(ca, &req->usage); avail = dev_buckets_free(ca, req->usage, req->watermark); - if (req->usage.buckets[BCH_DATA_need_discard] > avail) + if (req->usage.buckets[BCH_DATA_need_discard] > + min(avail, ca->mi.nbuckets >> 7)) bch2_dev_do_discards(ca); if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) @@ -535,7 +534,7 @@ again: track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - ob = ERR_PTR(-BCH_ERR_freelist_empty); + ob = ERR_PTR(bch_err_throw(c, freelist_empty)); goto err; } @@ -560,7 +559,7 @@ alloc: } err: if (!ob) - ob = ERR_PTR(-BCH_ERR_no_buckets_found); + ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); if (!IS_ERR(ob)) ob->data_type = req->data_type; @@ -603,18 +602,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe, #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs) +void bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs, + struct dev_alloc_list *ret) { - struct dev_alloc_list ret = { .nr = 0 }; - unsigned i; + ret->nr = 0; + unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.data[ret.nr++] = i; + ret->data[ret->nr++] = i; - bubble_sort(ret.data, ret.nr, dev_stripe_cmp); - return ret; + bubble_sort(ret->data, ret->nr, dev_stripe_cmp); } static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ @@ -705,18 +704,19 @@ static int add_new_bucket(struct bch_fs *c, return 0; } -int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct alloc_request *req, - struct dev_stripe_state *stripe, - struct closure *cl) +inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct alloc_request *req, + struct dev_stripe_state *stripe, + struct closure *cl) { struct bch_fs *c = trans->c; - int ret = -BCH_ERR_insufficient_devices; + int ret = 0; BUG_ON(req->nr_effective >= req->nr_replicas); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); - darray_for_each(devs_sorted, i) { + bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); + + darray_for_each(req->devs_sorted, i) { req->ca = bch2_dev_tryget_noerror(c, *i); if (!req->ca) continue; @@ -739,13 +739,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, req, ob)) { - ret = 0; + ret = add_new_bucket(c, req, ob); + if (ret) break; - } } - return ret; + if (ret == 1) + return 0; + if (ret) + return ret; + return bch_err_throw(c, insufficient_devices); } /* Allocate from stripes: */ @@ -776,9 +779,9 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (!h) return 0; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); - darray_for_each(devs_sorted, i) + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); + + darray_for_each(req->devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; @@ -872,9 +875,8 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; ret = add_new_bucket(c, req, ob); if (ret) @@ -1056,9 +1058,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); @@ -1086,14 +1087,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; - rcu_read_lock(); + guard(rcu)(); hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) - goto out; - wp = NULL; -out: - rcu_read_unlock(); - return wp; + return wp; + return NULL; } static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) @@ -1104,7 +1102,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) return stranded * factor > free; } -static bool try_increase_writepoints(struct bch_fs *c) +static noinline bool try_increase_writepoints(struct bch_fs *c) { struct write_point *wp; @@ -1117,7 +1115,7 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) +static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) { struct bch_fs *c = trans->c; struct write_point *wp; @@ -1379,11 +1377,11 @@ err: goto retry; if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); return ret; } @@ -1637,19 +1635,16 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) bch2_printbuf_make_room(&buf, 4096); - rcu_read_lock(); buf.atomic++; - - for_each_online_member_rcu(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } --buf.atomic; - rcu_read_unlock(); prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 2e01c7b61ed1..1b3fc8460096 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -42,6 +42,7 @@ struct alloc_request { struct bch_devs_mask devs_may_alloc; /* bch2_bucket_alloc_set_trans(): */ + struct dev_alloc_list devs_sorted; struct bch_dev_usage usage; /* bch2_bucket_alloc_trans(): */ @@ -71,9 +72,10 @@ struct alloc_request { struct bch_devs_mask scratch_devs_may_alloc; }; -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *); +void bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *, + struct dev_alloc_list *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index cde7dd115267..77d93beb3c8f 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -48,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) { - u32 bucket_offset; - struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - rcu_read_unlock(); + struct bch_dev *ca; + u32 bucket_offset; + struct bpos bucket; + scoped_guard(rcu) { + ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); + if (ca) + bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); + } + + if (ca) prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); - } else { - rcu_read_unlock(); + else prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); prt_str(out, " data_type="); @@ -140,7 +142,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } if (!will_check && __bch2_inconsistent_error(c, &buf)) - ret = -BCH_ERR_erofs_unfixed_errors; + ret = bch_err_throw(c, erofs_unfixed_errors); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); @@ -293,7 +295,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, return b; if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); } else { int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed, commit); @@ -591,6 +593,7 @@ check_existing_bp: bkey_for_each_ptr(other_extent_ptrs, ptr) if (ptr->dev == bp->k.p.inode && dev_ptr_stale_rcu(ca, ptr)) { + rcu_read_unlock(); ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret) @@ -648,7 +651,7 @@ check_existing_bp: prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; missing: printbuf_reset(&buf); @@ -679,26 +682,23 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (!ca) { - rcu_read_unlock(); - continue; - } + bool empty; + { + /* scoped_guard() is a loop, so it breaks continue */ + guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + if (!ca) + continue; - if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) { - rcu_read_unlock(); - continue; - } + if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) + continue; - u64 b = PTR_BUCKET_NR(ca, &p.ptr); - if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) { - rcu_read_unlock(); - continue; - } + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) + continue; - bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); - rcu_read_unlock(); + empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); + } struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); @@ -953,7 +953,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(c, transaction_restart_nested); goto err; } @@ -981,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) case KEY_TYPE_btree_ptr_v2: { bool ret = false; - rcu_read_lock(); + guard(rcu)(); struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; while (pos.inode <= k.k->p.inode) { if (pos.inode >= c->sb.nr_devices) @@ -1009,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) next: pos = SPOS(pos.inode + 1, 0, 0); } - rcu_read_unlock(); return ret; } @@ -1352,7 +1351,7 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), sizeof(unsigned long), GFP_KERNEL); if (!b->buckets) - return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); } b->nr += !__test_and_set_bit(bit, b->buckets); @@ -1361,7 +1360,8 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u return 0; } -int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size) +int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, + u64 old_size, u64 new_size) { scoped_guard(mutex, &b->lock) { if (!b->buckets) @@ -1370,7 +1370,7 @@ int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_siz unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), sizeof(unsigned long), GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); memcpy(n, b->buckets, BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 6840561084ce..7e71afee1ac0 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); - rcu_read_unlock(); return ca != NULL; } @@ -195,7 +194,7 @@ static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) return bitmap && test_bit(i, bitmap); } -int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64); +int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); void bch2_bucket_bitmap_free(struct bucket_bitmap *); #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7824da2af9d0..ddfacad0f70c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -183,6 +183,16 @@ #define pr_fmt(fmt) "%s() " fmt "\n", __func__ #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define ENUMERATED_REF_DEBUG +#endif + +#ifndef dynamic_fault +#define dynamic_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") + #include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> @@ -219,15 +229,30 @@ #include "time_stats.h" #include "util.h" -#ifdef CONFIG_BCACHEFS_DEBUG -#define ENUMERATED_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 0 -#endif +#include "alloc_types.h" +#include "async_objs_types.h" +#include "btree_gc_types.h" +#include "btree_types.h" +#include "btree_node_scan_types.h" +#include "btree_write_buffer_types.h" +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "disk_groups_types.h" +#include "ec_types.h" +#include "enumerated_ref_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "recovery_passes_types.h" +#include "replicas_types.h" +#include "sb-members_types.h" +#include "subvolume_types.h" +#include "super_types.h" +#include "thread_with_file_types.h" -#define race_fault(...) dynamic_fault("bcachefs:race") +#include "trace.h" #define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) @@ -271,7 +296,6 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") void bch2_print_str(struct bch_fs *, const char *, const char *); -void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); @@ -380,6 +404,14 @@ do { \ pr_info(fmt, ##__VA_ARGS__); \ } while (0) +static inline int __bch2_err_trace(struct bch_fs *c, int err) +{ + trace_error_throw(c, err, _THIS_IP_); + return err; +} + +#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) + /* Parameters that are useful for debugging, but should always be compiled in: */ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ @@ -486,29 +518,6 @@ enum bch_time_stats { BCH_TIME_STAT_NR }; -#include "alloc_types.h" -#include "async_objs_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "enumerated_ref_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "recovery_passes_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include "super_types.h" -#include "thread_with_file_types.h" - /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -758,7 +767,8 @@ struct btree_trans_buf { x(sysfs) \ x(btree_write_buffer) \ x(btree_node_scrub) \ - x(async_recovery_passes) + x(async_recovery_passes) \ + x(ioctl_data) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -853,9 +863,7 @@ struct bch_fs { DARRAY(enum bcachefs_metadata_version) incompat_versions_requested; -#ifdef CONFIG_UNICODE struct unicode_map *cf_encoding; -#endif struct bch_sb_handle disk_sb; @@ -1275,4 +1283,13 @@ static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca : ca->mi.discard; } +static inline bool bch2_fs_casefold_enabled(struct bch_fs *c) +{ +#ifdef CONFIG_UNICODE + return !c->opts.casefold_disabled; +#else + return false; +#endif +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8557cbd3d818..83c9860e6b82 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -85,7 +85,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) six_unlock_intent(&b->c.lock); } -static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +void __btree_node_data_free(struct btree *b) { BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); @@ -112,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) munmap(b->aux_data, btree_aux_data_bytes(b)); #endif b->aux_data = NULL; - - btree_node_to_freedlist(bc, b); } static void btree_node_data_free(struct btree_cache *bc, struct btree *b) { BUG_ON(list_empty(&b->list)); list_del_init(&b->list); + + __btree_node_data_free(b); + --bc->nr_freeable; - __btree_node_data_free(bc, b); + btree_node_to_freedlist(bc, b); } static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -149,7 +150,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); #ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else @@ -162,7 +163,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvfree(b->data); b->data = NULL; - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); } return 0; @@ -185,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) { - struct btree_cache *bc = &c->btree_cache; - struct btree *b; - - b = __btree_node_mem_alloc(c, GFP_KERNEL); + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) return NULL; @@ -198,8 +196,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) } bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); - - __bch2_btree_node_to_freelist(bc, b); return b; } @@ -353,21 +349,21 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, if (btree_node_noevict(b)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (btree_node_write_blocked(b)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (btree_node_will_make_reachable(b)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (btree_node_dirty(b)) { if (!flush) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (locked) { @@ -393,7 +389,7 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (locked) @@ -424,13 +420,13 @@ retry_unlocked: if (!six_trylock_intent(&b->c.lock)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (!six_trylock_write(&b->c.lock)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; six_unlock_intent(&b->c.lock); - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } /* recheck under lock */ @@ -524,7 +520,8 @@ restart: --touched;; } else if (!btree_node_reclaim(c, b)) { __bch2_btree_node_hash_remove(bc, b); - __btree_node_data_free(bc, b); + __btree_node_data_free(b); + btree_node_to_freedlist(bc, b); freed++; bc->nr_freed++; @@ -652,9 +649,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->nr_reserve; i++) - if (!__bch2_btree_node_mem_alloc(c)) + for (i = 0; i < bc->nr_reserve; i++) { + struct btree *b = __bch2_btree_node_mem_alloc(c); + if (!b) goto err; + __bch2_btree_node_to_freelist(bc, b); + } list_splice_init(&bc->live[0].list, &bc->freeable); @@ -682,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) return 0; err: - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -727,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); } closure_wait(&bc->alloc_wait, cl); @@ -741,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure } trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); success: trace_and_count(c, btree_cache_cannibalize_lock, trans); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index ca3c1b145330..be275f87a60e 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); +void __btree_node_data_free(struct btree *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 91b6395421df..bac108e93823 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -150,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->min_key = new_min; @@ -190,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->max_key = new_max; @@ -397,7 +397,11 @@ again: continue; } - ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); + ret = lockrestart_do(trans, + btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); + if (ret < 0) + goto err; + if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -438,7 +442,8 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); + ret = lockrestart_do(trans, + btree_repair_node_end(trans, b, prev, pulled_from_scan)); if (ret == DID_FILL_FROM_SCAN) { new_pass = true; ret = 0; @@ -498,8 +503,14 @@ again: prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + /* + * XXX: we're not passing the trans object here because we're not set up + * to handle a transaction restart - this code needs to be rewritten + * when we start doing online topology repair + */ + bch2_trans_unlock_long(trans); if (mustfix_fsck_err_on(!have_child, - trans, btree_node_topology_interior_node_empty, + c, btree_node_topology_interior_node_empty, "empty interior btree node at %s", buf.buf)) ret = DROP_THIS_NODE; err: @@ -519,49 +530,72 @@ fsck_err: bch2_bkey_buf_exit(&prev_k, c); bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); + bch_err_fn(c, ret); return ret; } -int bch2_check_topology(struct bch_fs *c) +static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, + bool *reconstructed_root) { - struct btree_trans *trans = bch2_trans_get(c); - struct bpos pulled_from_scan = POS_MIN; + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, btree); struct printbuf buf = PRINTBUF; int ret = 0; - bch2_trans_srcu_unlock(trans); + bch2_btree_id_to_text(&buf, btree); - for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { - struct btree_root *r = bch2_btree_id_root(c, i); - bool reconstructed_root = false; + if (r->error) { + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); - printbuf_reset(&buf); - bch2_btree_id_to_text(&buf, i); + ret = bch2_btree_has_scanned_nodes(c, btree); + if (ret < 0) + goto err; - if (r->error) { -reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); + if (!ret) { + __fsck_err(trans, + FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0), + btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", buf.buf); r->alive = false; r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 0); + } else { + r->alive = false; + r->error = 0; + bch2_btree_root_alloc_fake_trans(trans, btree, 1); - if (!bch2_btree_has_scanned_nodes(c, i)) { - __fsck_err(trans, - FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), - btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); - bch2_btree_root_alloc_fake_trans(trans, i, 0); - } else { - bch2_btree_root_alloc_fake_trans(trans, i, 1); - bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); - if (ret) - break; - } - - reconstructed_root = true; + bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX); + if (ret) + goto err; } + *reconstructed_root = true; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_topology(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bpos pulled_from_scan = POS_MIN; + int ret = 0; + + bch2_trans_srcu_unlock(trans); + + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + bool reconstructed_root = false; +recover: + ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root)); + if (ret) + break; + + struct btree_root *r = bch2_btree_id_root(c, i); struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); @@ -575,17 +609,21 @@ reconstruct_root: r->b = NULL; - if (!reconstructed_root) - goto reconstruct_root; + if (!reconstructed_root) { + r->error = -EIO; + goto recover; + } + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, i); bch_err(c, "empty btree root %s", buf.buf); + printbuf_exit(&buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } -fsck_err: - printbuf_exit(&buf); + bch2_trans_put(trans); return ret; } @@ -935,7 +973,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c) ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); if (ret) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; + ret = bch_err_throw(c, ENOMEM_gc_alloc_start); break; } } @@ -1093,42 +1131,41 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bkey_i *u; - int ret; if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + bool too_stale = false; + scoped_guard(rcu) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (dev_ptr_stale(ca, ptr) > 16) { - rcu_read_unlock(); - goto update; + too_stale |= dev_ptr_stale(ca, ptr) > 16; } + + if (!too_stale) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; + } } - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + if (too_stale) { + struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; + bch2_extent_normalize(c, bkey_i_to_s(u)); } - rcu_read_unlock(); - return 0; -update: - u = bch2_bkey_make_mut(trans, iter, &k, 0); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - bch2_extent_normalize(c, bkey_i_to_s(u)); return 0; } @@ -1181,7 +1218,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_gens; + ret = bch_err_throw(c, ENOMEM_gc_gens); goto err; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 34018296053a..590cd29f3e86 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -557,7 +557,9 @@ static int __btree_err(int ret, const char *fmt, ...) { if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - return -BCH_ERR_fsck_fix; + return ret == -BCH_ERR_btree_node_read_err_fixable + ? bch_err_throw(c, fsck_fix) + : ret; bool have_retry = false; int ret2; @@ -566,15 +568,15 @@ static int __btree_err(int ret, bch2_mark_btree_validate_failure(failed, ca->dev_idx); struct extent_ptr_decoded pick; - have_retry = !bch2_bkey_pick_read_device(c, + have_retry = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - failed, &pick, -1); + failed, &pick, -1) == 1; } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = -BCH_ERR_btree_node_read_err_fixable; + ret = bch_err_throw(c, btree_node_read_err_fixable); if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = -BCH_ERR_btree_node_read_err_bad_node; + ret = bch_err_throw(c, btree_node_read_err_bad_node); bch2_sb_error_count(c, err_type); @@ -602,18 +604,17 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); - if (ret2 != -BCH_ERR_fsck_fix && - ret2 != -BCH_ERR_fsck_ignore) { + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { ret = ret2; goto fsck_err; } if (!have_retry) - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); - ret = __bch2_topology_error(c, &out); break; } @@ -631,18 +632,17 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); - if (ret2 != -BCH_ERR_fsck_fix && - ret2 != -BCH_ERR_fsck_ignore) { + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { ret = ret2; goto fsck_err; } if (!have_retry) - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); - ret = __bch2_topology_error(c, &out); break; } print: @@ -660,7 +660,7 @@ fsck_err: failed, err_msg, \ msg, ##__VA_ARGS__); \ \ - if (_ret != -BCH_ERR_fsck_fix) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ ret = _ret; \ goto fsck_err; \ } \ @@ -723,12 +723,11 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned offset, unsigned sectors, int write, + unsigned offset, int write, struct bch_io_failures *failed, struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); - unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; @@ -741,16 +740,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); - if (btree_err_on(version < c->sb.version_min, + if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes && + btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version_min = cpu_to_le16(version); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + if (bch2_version_compatible(version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } else { + /* We have no idea what's going on: */ + i->version = cpu_to_le16(c->sb.version); + } } if (btree_err_on(BCH_VERSION_MAJOR(version) > @@ -772,15 +777,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (!write && - btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)), - -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, NULL, - bset_past_end_of_btree_node, - "bset past end of btree node (offset %u len %u but written %zu)", - offset, sectors, ptr_written ?: btree_sectors(c))) - i->u64s = 0; - btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, NULL, @@ -1045,6 +1041,7 @@ got_good_key: le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); } fsck_err: printbuf_exit(&buf); @@ -1144,6 +1141,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); if (first) { + sectors = vstruct_sectors(b->data, c->block_bits); + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, NULL, + bset_past_end_of_btree_node, + "bset past end of btree node (offset %u len %u but written %zu)", + b->written, sectors, ptr_written ?: btree_sectors(c))) + i->u64s = 0; if (good_csum_type) { struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); bool csum_bad = bch2_crc_cmp(b->data->csum, csum); @@ -1171,9 +1176,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); - - sectors = vstruct_sectors(b->data, c->block_bits); } else { + sectors = vstruct_sectors(bne, c->block_bits); + if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, NULL, + bset_past_end_of_btree_node, + "bset past end of btree node (offset %u len %u but written %zu)", + b->written, sectors, ptr_written ?: btree_sectors(c))) + i->u64s = 0; if (good_csum_type) { struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); bool csum_bad = bch2_crc_cmp(bne->csum, csum); @@ -1194,14 +1205,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, "decrypting btree node: %s", bch2_err_str(ret))) goto fsck_err; } - - sectors = vstruct_sectors(bne, c->block_bits); } b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); + ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg); if (ret) goto fsck_err; @@ -1286,9 +1295,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); - if (updated_range) - bch2_btree_node_drop_keys_outside_node(b); - i = &b->data->keys; for (k = i->start; k != vstruct_last(i);) { struct bkey tmp; @@ -1305,6 +1311,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_error(b); continue; } if (ret) @@ -1325,17 +1332,50 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - rcu_read_lock(); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) - set_btree_node_need_rewrite(b); + /* + * XXX: + * + * We deadlock if too many btree updates require node rewrites while + * we're still in journal replay. + * + * This is because btree node rewrites generate more updates for the + * interior updates (alloc, backpointers), and if those updates touch + * new nodes and generate more rewrites - well, you see the problem. + * + * The biggest cause is that we don't use the btree write buffer (for + * the backpointer updates - this needs some real thought on locking in + * order to fix. + * + * The problem with this workaround (not doing the rewrite for degraded + * nodes in journal replay) is that those degraded nodes persist, and we + * don't want that (this is a real bug when a btree node write completes + * with fewer replicas than we wanted and leaves a degraded node due to + * device _removal_, i.e. the device went away mid write). + * + * It's less of a bug here, but still a problem because we don't yet + * have a way of tracking degraded data - we another index (all + * extents/btree nodes, by replicas entry) in order to fix properly + * (re-replicate degraded data at the earliest possible time). + */ + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) { + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) { + set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_degraded(b); + } + } } - rcu_read_unlock(); - if (!ptr_written) + if (!ptr_written) { set_btree_node_need_rewrite(b); + set_btree_node_need_rewrite_ptr_written_zero(b); + } fsck_err: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); @@ -1366,7 +1406,7 @@ static void btree_node_read_work(struct work_struct *work) ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), &failed, &rb->pick, -1); - if (ret) { + if (ret <= 0) { set_btree_node_read_error(b); break; } @@ -1688,7 +1728,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; + return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); closure_init(&ra->cl, NULL); ra->c = c; @@ -1870,7 +1910,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - ret = -BCH_ERR_btree_node_read_error; + ret = bch_err_throw(c, btree_node_read_error); goto err; } @@ -1971,28 +2011,12 @@ static void btree_node_scrub_work(struct work_struct *work) prt_newline(&err); if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { - struct btree_trans *trans = bch2_trans_get(c); - - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, scrub->btree, - scrub->key.k->k.p, 0, scrub->level - 1, 0); - - struct btree *b; - int ret = lockrestart_do(trans, - PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter))); - if (ret) - goto err; - - if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { - bch_err(c, "error validating btree node during scrub on %s at btree %s", - scrub->ca->name, err.buf); - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - bch2_trans_begin(trans); - bch2_trans_put(trans); + int ret = bch2_trans_do(c, + bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1, + scrub->key.k, 0)); + if (!bch2_err_matches(ret, ENOENT) && + !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); } printbuf_exit(&err); @@ -2020,7 +2044,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_fs *c = trans->c; if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); struct extent_ptr_decoded pick; int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); @@ -2030,7 +2054,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_scrub); if (!ca) { - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); goto err; } @@ -2167,7 +2191,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_node_write_all_failed; + ret = bch_err_throw(c, btree_node_write_all_failed); goto err; } @@ -2256,7 +2280,7 @@ static void btree_node_write_endio(struct bio *bio) } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors) + struct bset *i) { int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), (struct bkey_validate_context) { @@ -2271,7 +2295,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, } ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); + validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -2464,7 +2488,7 @@ do_write: /* if we're going to be encrypting, check metadata validity first: */ if (validate_before_checksum && - validate_bset_for_write(c, b, i, sectors_to_write)) + validate_bset_for_write(c, b, i)) goto err; ret = bset_encrypt(c, i, b->written << 9); @@ -2481,7 +2505,7 @@ do_write: /* if we're not encrypting, check metadata after checksumming: */ if (!validate_before_checksum && - validate_bset_for_write(c, b, i, sectors_to_write)) + validate_bset_for_write(c, b, i)) goto err; /* diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b4bf4217a3fa..f8829b667ad3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -890,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct btree_path *path, - unsigned flags, - struct bkey_buf *out) + unsigned flags) { struct bch_fs *c = trans->c; struct btree_path_level *l = path_l(path); @@ -915,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, goto err; } - bch2_bkey_buf_reassemble(out, c, k); + bkey_reassemble(&trans->btree_path_down, k); if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) @@ -926,6 +925,22 @@ err: return ret; } +static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return bch_err_throw(c, btree_need_topology_repair); +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, unsigned flags, @@ -936,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = __btree_lock_want(path, level); - struct bkey_buf tmp; int ret; EBUG_ON(!btree_node_locked(path, path->level)); - bch2_bkey_buf_init(&tmp); - if (unlikely(trans->journal_replay_not_finished)) { - ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + ret = btree_node_iter_and_journal_peek(trans, path, flags); if (ret) - goto err; + return ret; } else { struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (!k) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); - - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_btree_need_topology_repair; - goto err; - } + if (unlikely(!k)) + return btree_node_missing_err(trans, path); - bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_bkey_unpack(l->b, &trans->btree_path_down, k); - if ((flags & BTREE_ITER_prefetch) && + if (unlikely((flags & BTREE_ITER_prefetch)) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) - goto err; + return ret; } } - b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, path, &trans->btree_path_down, + level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) - goto err; + return ret; - if (likely(!trans->journal_replay_not_finished && - tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && - unlikely(b != btree_node_mem_ptr(tmp.k))) + if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && + likely(!trans->journal_replay_not_finished && + trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) @@ -992,9 +994,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(trans, path); -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; + return 0; } static int bch2_btree_path_traverse_all(struct btree_trans *trans) @@ -1006,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) int ret = 0; if (trans->in_traverse_all) - return -BCH_ERR_transaction_restart_in_traverse_all; + return bch_err_throw(c, transaction_restart_in_traverse_all); trans->in_traverse_all = true; retry_all: @@ -2076,14 +2076,14 @@ inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter static noinline void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k) + struct bpos search_key, struct bkey_s_c *k) { struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; trans_for_each_update(trans, i) if (!i->key_cache_already_flushed && i->btree_id == iter->btree_id && - bpos_le(i->k->k.p, iter->pos) && + bpos_le(i->k->k.p, search_key) && bpos_ge(i->k->k.p, k->k ? k->k->p : end)) { iter->k = i->k->k; *k = bkey_i_to_s_c(i->k); @@ -2092,6 +2092,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_ static noinline void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); @@ -2100,7 +2101,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter trans_for_each_update(trans, i) if (!i->key_cache_already_flushed && i->btree_id == iter->btree_id && - bpos_ge(i->k->k.p, path->pos) && + bpos_ge(i->k->k.p, search_key) && bpos_le(i->k->k.p, k->k ? k->k->p : end)) { iter->k = i->k->k; *k = bkey_i_to_s_c(i->k); @@ -2122,13 +2123,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_pos, struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, - path->pos, + search_pos, end_pos, &iter->journal_idx); } @@ -2138,7 +2140,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { struct btree_path *path = btree_iter_path(trans, iter); - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); if (k) { iter->k = k->k; @@ -2151,11 +2153,12 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, static noinline void btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, + bch2_btree_journal_peek(trans, iter, search_key, k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -2165,13 +2168,14 @@ void btree_trans_peek_journal(struct btree_trans *trans, static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bpos end_pos) { struct btree_path *path = btree_iter_path(trans, iter); return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, path->level, - path->pos, + search_key, end_pos, &iter->journal_idx); } @@ -2179,12 +2183,13 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, static noinline void btree_trans_peek_prev_journal(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key, struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = - bch2_btree_journal_peek_prev(trans, iter, - k->k ? k->k->p : path_l(path)->b->key.k.p); + bch2_btree_journal_peek_prev(trans, iter, search_key, + k->k ? k->k->p : path_l(path)->b->data->min_key); if (next_journal) { iter->k = next_journal->k; @@ -2292,11 +2297,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_journal(trans, iter, &k); + btree_trans_peek_journal(trans, iter, search_key, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) - bch2_btree_trans_peek_updates(trans, iter, &k); + bch2_btree_trans_peek_updates(trans, iter, search_key, &k); if (k.k && bkey_deleted(k.k)) { /* @@ -2326,6 +2331,20 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct } bch2_btree_iter_verify(trans, iter); + + if (trace___btree_iter_peek_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace___btree_iter_peek(trans->c, buf.buf); + } + return k; } @@ -2484,6 +2503,19 @@ out_no_locked: bch2_btree_iter_verify_entry_exit(iter); + if (trace_btree_iter_peek_max_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_max(trans->c, buf.buf); + } + return k; end: bch2_btree_iter_set_pos(trans, iter, end); @@ -2557,11 +2589,11 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - btree_trans_peek_prev_journal(trans, iter, &k); + btree_trans_peek_prev_journal(trans, iter, search_key, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) - bch2_btree_trans_peek_prev_updates(trans, iter, &k); + bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k); if (likely(k.k && !bkey_deleted(k.k))) { break; @@ -2724,6 +2756,19 @@ out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(trans, iter); + + if (trace_btree_iter_peek_prev_min_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_prev_min(trans->c, buf.buf); + } return k; end: bch2_btree_iter_set_pos(trans, iter, end); @@ -2767,8 +2812,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { - if (iter->pos.inode == KEY_INODE_MAX) - return bkey_s_c_null; + if (iter->pos.inode == KEY_INODE_MAX) { + k = bkey_s_c_null; + goto out2; + } bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); } @@ -2785,8 +2832,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre } struct btree_path *path = btree_iter_path(trans, iter); - if (unlikely(!btree_path_node(path, path->level))) - return bkey_s_c_null; + if (unlikely(!btree_path_node(path, path->level))) { + k = bkey_s_c_null; + goto out2; + } btree_path_set_should_be_locked(trans, path); @@ -2879,7 +2928,20 @@ out: bch2_btree_iter_verify(trans, iter); ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) - return bkey_s_c_err(ret); + k = bkey_s_c_err(ret); +out2: + if (trace_btree_iter_peek_slot_enabled()) { + CLASS(printbuf, buf)(); + + int ret = bkey_err(k); + if (ret) + prt_str(&buf, bch2_err_str(ret)); + else if (k.k) + bch2_bkey_val_to_text(&buf, trans->c, k); + else + prt_str(&buf, "(null)"); + trace_btree_iter_peek_slot(trans->c, buf.buf); + } return k; } @@ -3132,6 +3194,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", + BTREE_TRANS_MEM_MAX); + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); @@ -3159,46 +3225,32 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long mutex_unlock(&s->lock); } - if (trans->used_mempool) { - if (trans->mem_bytes >= new_bytes) - goto out_change_top; - - /* No more space from mempool item, need malloc new one */ - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); - - new_mem = kmalloc(new_bytes, GFP_KERNEL); - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { + EBUG_ON(trans->mem_bytes >= new_bytes); + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + } - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_mem); - return ERR_PTR(ret); - } - } - memcpy(new_mem, trans->mem, trans->mem_top); - trans->used_mempool = false; - mempool_free(trans->mem, &c->btree_trans_mem_pool); - goto out_new_mem; + if (old_bytes) { + trans->realloc_bytes_required = new_bytes; + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } - new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + EBUG_ON(trans->mem); + + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + new_mem = kmalloc(new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; - memcpy(new_mem, trans->mem, trans->mem_top); trans->used_mempool = true; - kfree(trans->mem); } - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + EBUG_ON(!new_mem); trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -3207,18 +3259,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long if (ret) return ERR_PTR(ret); } -out_new_mem: + trans->mem = new_mem; trans->mem_bytes = new_bytes; - if (old_bytes) { - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart_ip(trans, - BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); - } -out_change_top: - bch2_trans_kmalloc_trace(trans, size, ip); - p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3279,6 +3323,27 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; + if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) { + EBUG_ON(!trans->mem || !trans->mem_bytes); + unsigned new_bytes = trans->realloc_bytes_required; + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + + if (!new_mem) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + kfree(trans->mem); + } + } + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + } + trans_for_each_path(trans, path, i) { path->should_be_locked = false; @@ -3568,13 +3633,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); - struct task_struct *owner; pid_t pid; - rcu_read_lock(); - owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - rcu_read_unlock(); + scoped_guard(rcu) { + struct task_struct *owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + } prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); bch2_btree_id_to_text(out, b->btree_id); @@ -3603,7 +3667,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); /* trans->paths is rcu protected vs. freeing */ - rcu_read_lock(); + guard(rcu)(); out->atomic++; struct btree_path *paths = rcu_dereference(trans->paths); @@ -3646,7 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) } out: --out->atomic; - rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2cabb5f0f484..09dd3e52622e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -963,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, _p; \ }) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ -}) - -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); @@ -990,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *); __bch2_trans_get(_c, trans_fn_idx); \ }) +/* + * We don't use DEFINE_CLASS() because using a function for the constructor + * breaks bch2_trans_get()'s use of __func__ + */ +typedef struct btree_trans * class_btree_trans_t; +static inline void class_btree_trans_destructor(struct btree_trans **p) +{ + struct btree_trans *trans = *p; + bch2_trans_put(trans); +} + +#define class_btree_trans_constructor(_c) bch2_trans_get(_c) + +#define bch2_trans_run(_c, _do) \ +({ \ + CLASS(btree_trans, trans)(_c); \ + (_do); \ +}) + +#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) + void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ade3b5addd75..ea839560a136 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -137,12 +137,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b struct journal_key *k; BUG_ON(*idx > keys->nr); + + if (!keys->nr) + return NULL; search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - while (*idx && - __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + while (*idx < keys->nr && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { (*idx)++; iters++; if (iters == 10) { @@ -151,18 +154,23 @@ search: } } + if (*idx == keys->nr) + --(*idx); + struct bkey_i *ret = NULL; rcu_read_lock(); /* for overwritten_ranges */ - while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + while (true) { + k = idx_to_key(keys, *idx); if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) break; if (k->overwritten) { if (k->overwritten_range) - *idx = rcu_dereference(k->overwritten_range)->start - 1; - else - *idx -= 1; + *idx = rcu_dereference(k->overwritten_range)->start; + if (!*idx) + break; + --(*idx); continue; } @@ -171,6 +179,8 @@ search: break; } + if (!*idx) + break; --(*idx); iters++; if (iters == 10) { @@ -292,7 +302,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); } /* Since @keys was full, there was no gap: */ @@ -331,7 +341,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); bkey_copy(n, k); ret = bch2_journal_key_insert_take(c, id, level, n); @@ -457,11 +467,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct bkey_s_c ret = bkey_s_c_null; - journal_iter_verify(iter); - rcu_read_lock(); + guard(rcu)(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -470,19 +478,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) { - ret = bkey_i_to_s_c(k->k); - break; - } + if (!k->overwritten) + return bkey_i_to_s_c(k->k); if (k->overwritten_range) iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); else bch2_journal_iter_advance(iter); } - rcu_read_unlock(); - return ret; + return bkey_s_c_null; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -646,10 +651,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) { const struct journal_key *l = _l; const struct journal_key *r = _r; + int rewind = l->rewind && r->rewind ? -1 : 1; return journal_key_cmp(l, r) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); + ((cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset)) * rewind); } void bch2_journal_keys_put(struct bch_fs *c) @@ -718,6 +724,8 @@ int bch2_journal_keys_sort(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; size_t nr_read = 0; + u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; + genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -726,28 +734,43 @@ int bch2_journal_keys_sort(struct bch_fs *c) cond_resched(); - for_each_jset_key(k, entry, &i->j) { - struct journal_key n = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - if (darray_push(keys, n)) { - __journal_keys_sort(keys); - - if (keys->nr * 8 > keys->size * 7) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", - keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return -BCH_ERR_ENOMEM_journal_keys_sort; + vstruct_for_each(&i->j, entry) { + bool rewind = !entry->level && + !btree_id_is_alloc(entry->btree_id) && + le64_to_cpu(i->j.seq) >= rewind_seq; + + if (entry->type != (rewind + ? BCH_JSET_ENTRY_overwrite + : BCH_JSET_ENTRY_btree_keys)) + continue; + + if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) + continue; + + jset_entry_for_each_key(entry, k) { + struct journal_key n = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .rewind = rewind, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return bch_err_throw(c, ENOMEM_journal_keys_sort); + } + + BUG_ON(darray_push(keys, n)); } - BUG_ON(darray_push(keys, n)); + nr_read++; } - - nr_read++; } } diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h index 8b773823704f..86aacb254fb2 100644 --- a/fs/bcachefs/btree_journal_iter_types.h +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -11,8 +11,9 @@ struct journal_key { u32 journal_offset; enum btree_id btree_id:8; unsigned level:8; - bool allocated; - bool overwritten; + bool allocated:1; + bool overwritten:1; + bool rewind:1; struct journal_key_range_overwritten __rcu * overwritten_range; struct bkey_i *k; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9da950e7eb7d..d96188b92db2 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -187,27 +187,23 @@ lock: static struct bkey_cached * bkey_cached_reuse(struct btree_key_cache *c) { - struct bucket_table *tbl; + + guard(rcu)(); + struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); struct rhash_head *pos; struct bkey_cached *ck; - unsigned i; - rcu_read_lock(); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); - for (i = 0; i < tbl->size; i++) + for (unsigned i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { if (bkey_cached_evict(c, ck)) - goto out; + return ck; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } } - ck = NULL; -out: - rcu_read_unlock(); - return ck; + return NULL; } static int btree_key_cache_create(struct btree_trans *trans, @@ -242,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_id_str(ck_path->btree_id)); - return -BCH_ERR_ENOMEM_btree_key_cache_create; + return bch_err_throw(c, ENOMEM_btree_key_cache_create); } } @@ -260,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!new_k)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); } else if (ret) { kfree(new_k); goto err; @@ -826,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->nr_pending = alloc_percpu(size_t); if (!bc->nr_pending) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->table_init_done = true; shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); if (!shrink) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 2f2aed0c9916..bed2b4b6ffb9 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -194,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans) return 3; } +static noinline __noreturn void break_cycle_fail(struct lock_graph *g) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); + + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:\n"); + printbuf_indent_add(&buf, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); +} + static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, struct trans_waiting_for_lock *from) { @@ -219,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, } } - if (unlikely(!best)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); - } + if (unlikely(!best)) + break_cycle_fail(g); ret = abort_lock(g, abort); out: @@ -255,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct printbuf *cycle) { struct btree_trans *orig_trans = g->g->trans; - struct trans_waiting_for_lock *i; - for (i = g->g; i < g->g + g->nr; i++) + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); return break_cycle(g, cycle, i); } - if (g->nr == ARRAY_SIZE(g->g)) { + if (unlikely(g->nr == ARRAY_SIZE(g->g))) { closure_put(&trans->ref); if (orig_trans->lock_may_not_fail) @@ -308,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) lock_graph_down(&g, trans); /* trans->paths is rcu protected vs. freeing */ - rcu_read_lock(); + guard(rcu)(); if (cycle) cycle->atomic++; next: @@ -406,7 +409,6 @@ up: out: if (cycle) --cycle->atomic; - rcu_read_unlock(); return ret; } @@ -769,7 +771,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) } static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, - struct get_locks_fail *f, bool trace) + struct get_locks_fail *f, bool trace, ulong ip) { if (!trace) goto out; @@ -794,7 +796,7 @@ static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, st prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } - trace_trans_restart_relock(trans, _RET_IP_, buf.buf); + trace_trans_restart_relock(trans, ip, buf.buf); printbuf_exit(&buf); } @@ -804,7 +806,7 @@ out: bch2_trans_verify_locks(trans); } -static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) +static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip) { bch2_trans_verify_locks(trans); @@ -823,7 +825,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) if (path->should_be_locked && (ret = btree_path_get_locks(trans, path, false, &f, BCH_ERR_transaction_restart_relock))) { - bch2_trans_relock_fail(trans, path, &f, trace); + bch2_trans_relock_fail(trans, path, &f, trace, ip); return ret; } } @@ -836,12 +838,12 @@ out: int bch2_trans_relock(struct btree_trans *trans) { - return __bch2_trans_relock(trans, true); + return __bch2_trans_relock(trans, true, _RET_IP_); } int bch2_trans_relock_notrace(struct btree_trans *trans) { - return __bch2_trans_relock(trans, false); + return __bch2_trans_relock(trans, false, _RET_IP_); } void bch2_trans_unlock(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 9adca77e2580..f2173a3316f4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -417,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); - path->should_be_locked = true; - trace_btree_path_should_be_locked(trans, path); + if (!path->should_be_locked) { + path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); + } } static inline void __btree_path_set_level_up(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 5a97a6b8a757..a3fb07c60e25 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -75,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k) } } -static bool found_btree_node_is_readable(struct btree_trans *trans, - struct found_btree_node *f) -{ - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; - - found_btree_node_to_key(&tmp.k, f); - - struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); - bool ret = !IS_ERR_OR_NULL(b); - if (!ret) - return ret; - - f->sectors_written = b->written; - f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); - - struct bkey_s_c k; - struct bkey unpacked; - struct btree_node_iter iter; - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) - f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); - - six_unlock_read(&b->c.lock); - - /* - * We might update this node's range; if that happens, we need the node - * to be re-read so the read path can trim keys that are no longer in - * this node - */ - if (b != btree_node_root(trans->c, b)) - bch2_btree_node_evict(trans, &tmp.k); - return ret; -} - static int found_btree_node_cmp_cookie(const void *_l, const void *_r) { const struct found_btree_node *l = _l; @@ -159,17 +126,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = { }; static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, - struct bio *bio, struct btree_node *bn, u64 offset) + struct btree *b, struct bio *bio, u64 offset) { struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct btree_node *bn = b->data; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); bio->bi_iter.bi_sector = offset; - bch2_bio_map(bio, bn, PAGE_SIZE); + bch2_bio_map(bio, b->data, c->opts.block_size); u64 submit_time = local_clock(); submit_bio_wait(bio); - bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); if (bio->bi_status) { @@ -217,7 +184,28 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, }; rcu_read_unlock(); - if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, b->data, c->opts.btree_node_size); + + submit_time = local_clock(); + submit_bio_wait(bio); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + found_btree_node_to_key(&b->key, &n); + + CLASS(printbuf, buf)(); + if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { + /* read_done will swap out b->data for another buffer */ + bn = b->data; + /* + * Grab journal_seq here because we want the max journal_seq of + * any bset; read_done sorts down to a single set and picks the + * max journal_seq + */ + n.journal_seq = le64_to_cpu(bn->keys.journal_seq), + n.sectors_written = b->written; + mutex_lock(&f->lock); if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { bch_err(c, "try_read_btree_node() can't handle endian conversion"); @@ -237,12 +225,20 @@ static int read_btree_nodes_worker(void *p) struct find_btree_nodes_worker *w = p; struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); struct bch_dev *ca = w->ca; - void *buf = (void *) __get_free_page(GFP_KERNEL); - struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); unsigned long last_print = jiffies; + struct btree *b = NULL; + struct bio *bio = NULL; + + b = __bch2_btree_node_mem_alloc(c); + if (!b) { + bch_err(c, "read_btree_nodes_worker: error allocating buf"); + w->f->ret = -ENOMEM; + goto err; + } - if (!buf || !bio) { - bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); + if (!bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio"); w->f->ret = -ENOMEM; goto err; } @@ -266,11 +262,13 @@ static int read_btree_nodes_worker(void *p) !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) continue; - try_read_btree_node(w->f, ca, bio, buf, sector); + try_read_btree_node(w->f, ca, b, bio, sector); } err: + if (b) + __btree_node_data_free(b); + kfree(b); bio_put(bio); - free_page((unsigned long) buf); enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); closure_put(w->cl); kfree(w); @@ -363,6 +361,8 @@ static int handle_overwrites(struct bch_fs *c, min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } + + cond_resched(); } return 0; @@ -519,8 +519,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) return false; } -bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) { + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + struct found_btree_node search = { .btree_id = btree, .level = 0, diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h index 08687b209787..66e6f9ed19d0 100644 --- a/fs/bcachefs/btree_node_scan.h +++ b/fs/bcachefs/btree_node_scan.h @@ -4,7 +4,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *); bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); -bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); void bch2_find_btree_nodes_exit(struct find_btree_nodes *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 1c03c965d836..639ef75b3dbd 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -376,7 +376,7 @@ static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { if (!bch2_btree_node_insert_fits(b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; + return bch_err_throw(trans->c, btree_insert_btree_node_full); return 0; } @@ -394,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + struct bch_fs *c = trans->c; + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; + return bch_err_throw(c, ENOMEM_btree_key_cache_insert); } ret = bch2_trans_relock(trans) ?: @@ -432,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (watermark < BCH_WATERMARK_reclaim && !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c)) - return -BCH_ERR_btree_insert_need_journal_reclaim; + return bch_err_throw(c, btree_insert_need_journal_reclaim); /* * bch2_varint_decode can read past the end of the buffer by at most 7 @@ -594,12 +595,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, int ret = 0; bch2_trans_verify_not_unlocked_or_in_restart(trans); - +#if 0 + /* todo: bring back dynamic fault injection */ if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } - +#endif /* * Check if the insert will fit in the leaf node with the write lock * held, otherwise another thread could write the node changing the @@ -756,6 +758,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, btree_trans_journal_entries_start(trans), trans->journal_entries.u64s); + EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s); + trans->journal_res.offset += trans->journal_entries.u64s; trans->journal_res.u64s -= trans->journal_entries.u64s; @@ -894,7 +898,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto out; } @@ -966,13 +970,26 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) for (struct jset_entry *i = btree_trans_journal_entries_start(trans); i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) + i = vstruct_next(i)) { if (i->type == BCH_JSET_ENTRY_btree_keys || i->type == BCH_JSET_ENTRY_write_buffer_keys) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); - if (ret) - return ret; + jset_entry_for_each_key(i, k) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); + if (ret) + return ret; + } + } + + if (i->type == BCH_JSET_ENTRY_btree_root) { + guard(mutex)(&c->btree_root_lock); + + struct btree_root *r = bch2_btree_id_root(c, i->btree_id); + + bkey_copy(&r->key, i->start); + r->level = i->level; + r->alive = true; } + } for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); i != btree_trans_subbuf_top(trans, &trans->accounting); @@ -989,6 +1006,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct btree_insert_entry *errored_at = NULL; struct bch_fs *c = trans->c; + unsigned journal_u64s = 0; int ret = 0; bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -1011,16 +1029,16 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) ret = do_bch2_trans_commit_to_journal_replay(trans); else - ret = -BCH_ERR_erofs_trans_commit; + ret = bch_err_throw(c, erofs_trans_commit); goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); + journal_u64s = jset_u64s(trans->accounting.u64s); trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { struct btree_path *path = trans->paths + i->path; @@ -1040,11 +1058,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) continue; /* we're going to journal the key being updated: */ - trans->journal_u64s += jset_u64s(i->k->k.u64s); + journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(i->old_k.u64s); + journal_u64s += jset_u64s(i->old_k.u64s); } if (trans->extra_disk_res) { @@ -1062,6 +1080,8 @@ retry: memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); + trans->journal_u64s = journal_u64s + trans->journal_entries.u64s; + ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_); /* make sure we didn't drop or screw up locks: */ @@ -1093,7 +1113,7 @@ err: * restart: */ if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto out; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9d641bf9d2a2..112170fd9c8f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -497,6 +497,7 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; + unsigned realloc_bytes_required; #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE darray_trans_kmalloc_trace trans_kmalloc_trace; #endif @@ -555,6 +556,8 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ + __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -615,6 +618,9 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ + x(need_rewrite_error) \ + x(need_rewrite_degraded) \ + x(need_rewrite_ptr_written_zero) \ x(never_write) \ x(pinned) @@ -639,6 +645,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \ BTREE_FLAGS() #undef x +#define BTREE_NODE_REWRITE_REASON() \ + x(none) \ + x(unknown) \ + x(error) \ + x(degraded) \ + x(ptr_written_zero) + +enum btree_node_rewrite_reason { +#define x(n) BTREE_NODE_REWRITE_##n, + BTREE_NODE_REWRITE_REASON() +#undef x +}; + +static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b) +{ + if (btree_node_need_rewrite_ptr_written_zero(b)) + return BTREE_NODE_REWRITE_ptr_written_zero; + if (btree_node_need_rewrite_degraded(b)) + return BTREE_NODE_REWRITE_degraded; + if (btree_node_need_rewrite_error(b)) + return BTREE_NODE_REWRITE_error; + if (btree_node_need_rewrite(b)) + return BTREE_NODE_REWRITE_unknown; + return BTREE_NODE_REWRITE_none; +} + static inline struct btree_write *btree_current_write(struct btree *b) { return b->writes + btree_node_write_idx(b); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 5dac09c98026..ee657b9f4b96 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -123,65 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) { - struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = {}; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; int ret = 0; - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; + darray_for_each(*s, id) { + pos.snapshot = *id; - darray_init(&s); - - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) - continue; - - new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(new_k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, + BTREE_ITER_not_extents| + BTREE_ITER_intent); + ret = bkey_err(k); if (ret) break; - if (new_k.k->type == KEY_TYPE_deleted) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (k.k->type == KEY_TYPE_deleted) { + struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(update); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, &iter); break; + } bkey_init(&update->k); - update->k.p = whiteout_pos; + update->k.p = pos; update->k.type = KEY_TYPE_whiteout; - ret = bch2_trans_update(trans, &new_iter, update, + ret = bch2_trans_update(trans, &iter, update, BTREE_UPDATE_internal_snapshot_node); } - bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &iter); - ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); if (ret) break; } - bch2_trans_iter_exit(trans, &new_iter); - bch2_trans_iter_exit(trans, &old_iter); - darray_exit(&s); + darray_exit(s); return ret; } @@ -570,20 +549,26 @@ void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, unsigned u64s) { unsigned new_top = buf->u64s + u64s; - unsigned old_size = buf->size; + unsigned new_size = buf->size; - if (new_top > buf->size) - buf->size = roundup_pow_of_two(new_top); + BUG_ON(roundup_pow_of_two(new_top) > U16_MAX); - void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); + if (new_top > new_size) + new_size = roundup_pow_of_two(new_top); + + void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64)); if (IS_ERR(n)) return n; + unsigned offset = (u64 *) n - (u64 *) trans->mem; + BUG_ON(offset > U16_MAX); + if (buf->u64s) memcpy(n, btree_trans_subbuf_base(trans, buf), - old_size * sizeof(u64)); + buf->size * sizeof(u64)); buf->base = (u64 *) n - (u64 *) trans->mem; + buf->size = new_size; void *p = btree_trans_subbuf_top(trans, buf); buf->u64s = new_top; @@ -608,7 +593,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(k.k->type != KEY_TYPE_deleted); if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; + ret = bch_err_throw(trans->c, ENOSPC_btree_slot); goto err; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index f907eaa8b185..0b98ab959719 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "journal.h" +#include "snapshot.h" struct bch_fs; struct btree; @@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, struct bpos); + struct bpos, snapshot_id_list *); /* * For use when splitting extents in existing snapshots: @@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos old_pos, struct bpos new_pos) { + BUG_ON(old_pos.snapshot != new_pos.snapshot); + if (!btree_type_has_snapshots(btree) || bkey_eq(old_pos, new_pos)) return 0; - return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); + snapshot_id_list s; + int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); + if (ret) + return ret; + + return s.nr + ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) + : 0; } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, @@ -160,8 +170,7 @@ bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); -int bch2_btree_write_buffer_insert_err(struct btree_trans *, - enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *); static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, @@ -172,7 +181,7 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); if (unlikely(!btree_type_uses_write_buffer(btree))) { - int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); + int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k); dump_stack(); return ret; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 74e65714fecd..553059b33bfd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -57,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) struct bkey_buf prev; int ret = 0; - printbuf_indent_add_nextline(&buf, 2); - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -69,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - ret = __bch2_topology_error(c, &buf); - + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect min_key: "); bch2_bpos_to_text(&buf, b->data->min_key); - log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); + goto err; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - ret = __bch2_topology_error(c, &buf); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect max_key: "); bch2_bpos_to_text(&buf, b->data->max_key); - log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); + goto err; } } @@ -100,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "end of prev node doesn't match start of next node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "end of prev node doesn't match start of next node"); prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto out; + bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); + goto err; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -120,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (bkey_deleted(&prev.k->k)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "empty interior node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - ret = __bch2_topology_error(c, &buf); + prt_printf(&buf, "empty interior node\n"); + bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); + goto err; + } - prt_str(&buf, "last child node doesn't end at end of parent node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\nlast key "); + if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); + goto err; } out: -fsck_err: bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&prev, c); printbuf_exit(&buf); return ret; +err: + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_newline(&buf); + + ret = __bch2_topology_error(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + BUG_ON(!ret); + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -685,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as) /* * Wait for any in flight writes to finish before we free the old nodes - * on disk: + * on disk. But we haven't pinned those old nodes in the btree cache, + * they might have already been evicted. + * + * The update we're completing deleted references to those nodes from the + * btree, so we know if they've been evicted they can't be pulled back in. + * We just have to check if the nodes we have pointers to are still those + * old nodes, and haven't been reused. + * + * This can't be done locklessly because the data buffer might have been + * vmalloc allocated, and they're not RCU freed. We also need the + * __no_kmsan_checks annotation because even with the btree node read + * lock, nothing tells us that the data buffer has been initialized (if + * the btree node has been reused for a different node, and the data + * buffer swapped for a new data buffer). */ for (i = 0; i < as->nr_old_nodes; i++) { b = as->old_nodes[i]; - if (btree_node_seq_matches(b, as->old_nodes_seq[i])) + bch2_trans_begin(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); + six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); + + if (seq_matches) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -1120,6 +1138,13 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * start_time); } +static const char * const btree_node_reawrite_reason_strs[] = { +#define x(n) #n, + BTREE_NODE_REWRITE_REASON() +#undef x + NULL, +}; + static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned level_start, bool split, @@ -1214,6 +1239,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); + struct btree *b = btree_path_node(path, path->level); + as->node_start = b->data->min_key; + as->node_end = b->data->max_key; + as->node_needed_rewrite = btree_node_rewrite_reason(b); + as->node_written = b->written; + as->node_sectors = btree_buf_bytes(b) >> 9; + as->node_remaining = __bch2_btree_u64s_remaining(b, + btree_bkey_last(b, bset_tree_last(b))); + /* * We don't want to allocate if we're in an error state, that can cause * deadlock on emergency shutdown due to open buckets getting stuck in @@ -1245,7 +1279,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto err; } @@ -1253,10 +1287,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, do { ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); - + if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); - } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); + } while (1); } if (ret) { @@ -2090,6 +2125,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; + as->node_start = prev->data->min_key; + as->node_end = next->data->max_key; + trace_and_count(c, btree_node_merge, trans, b); n = bch2_btree_node_alloc(as, trans, b->c.level); @@ -2178,7 +2216,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); - ret = -BCH_ERR_btree_node_dying; + ret = bch_err_throw(trans->c, btree_node_dying); goto err; } @@ -2256,9 +2294,9 @@ err: goto out; } -static int bch2_btree_node_rewrite_key(struct btree_trans *trans, - enum btree_id btree, unsigned level, - struct bkey_i *k, unsigned flags) +int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, @@ -2330,9 +2368,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work) int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, a->btree_id, a->level, a->key.k, 0)); - if (ret != -ENOENT && - !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_shutdown) + if (!bch2_err_matches(ret, ENOENT) && + !bch2_err_matches(ret, EROFS)) bch_err_fn_ratelimited(c, ret); spin_lock(&c->btree_node_rewrites_lock); @@ -2663,9 +2700,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_str(out, " "); bch2_btree_id_to_text(out, as->btree_id); - prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + prt_printf(out, " l=%u-%u ", as->update_level_start, - as->update_level_end, + as->update_level_end); + bch2_bpos_to_text(out, as->node_start); + prt_char(out, ' '); + bch2_bpos_to_text(out, as->node_end); + prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", + as->node_written, + as->node_sectors, + as->node_remaining, + btree_node_reawrite_reason_strs[as->node_needed_rewrite]); + + prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", bch2_btree_update_modes[as->mode], as->nodes_written, closure_nr_remaining(&as->cl), @@ -2792,16 +2839,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); c->btree_node_rewrite_worker = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); if (!c->btree_node_rewrite_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) - return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); return 0; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7fe793788a79..ac04e45a8515 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -57,6 +57,13 @@ struct btree_update { unsigned took_gc_lock:1; enum btree_id btree_id; + struct bpos node_start; + struct bpos node_end; + enum btree_node_rewrite_reason node_needed_rewrite; + u16 node_written; + u16 node_sectors; + u16 node_remaining; + unsigned update_level_start; unsigned update_level_end; @@ -169,6 +176,9 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, struct btree *, unsigned, unsigned); +int bch2_btree_node_rewrite_key(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_i *, unsigned); int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, struct bpos, unsigned, unsigned); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index efb0c64d0aac..4b095235a0d2 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -267,10 +267,9 @@ out: BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } -int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, +int bch2_btree_write_buffer_insert_err(struct bch_fs *c, enum btree_id btree, struct bkey_i *k) { - struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; prt_printf(&buf, "attempting to do write buffer update on non wb btree="); @@ -332,7 +331,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; if (unlikely(!btree_type_uses_write_buffer(k->btree))) { - ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k); goto err; } @@ -394,7 +393,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bool accounting_accumulated = false; do { if (race_fault()) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); break; } @@ -633,7 +632,7 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) struct bch_fs *c = trans->c; if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); @@ -676,7 +675,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, goto err; bch2_bkey_buf_copy(last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; + + /* can we avoid the unconditional restart? */ + trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); + ret = bch_err_throw(c, transaction_restart_write_buffer_flush); } err: bch2_bkey_buf_exit(&tmp, c); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 05f56fd1eed0..c351d21aca0b 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -89,6 +89,12 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, struct journal_keys_to_wb *dst, enum btree_id btree, struct bkey_i *k) { + if (unlikely(!btree_type_uses_write_buffer(btree))) { + int ret = bch2_btree_write_buffer_insert_err(c, btree, k); + dump_stack(); + return ret; + } + EBUG_ON(!dst->seq); return k->k.type == KEY_TYPE_accounting diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 09eb5a543ae4..f25903c10e8a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -221,6 +221,20 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { if (!p.ptr.cached && data_type == BCH_DATA_btree) { + switch (g->data_type) { + case BCH_DATA_sb: + bch_err(c, "btree and superblock in the same bucket - cannot repair"); + ret = bch_err_throw(c, fsck_repair_unimplemented); + goto out; + case BCH_DATA_journal: + ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); + bch_err_msg(c, ret, "error deleting journal bucket %zu", + PTR_BUCKET_NR(ca, &p.ptr)); + if (ret) + goto out; + break; + } + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; @@ -270,6 +284,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; + /* We don't yet do btree key updates correctly for when we're RW */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -277,20 +294,13 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, } if (do_update) { - if (flags & BTREE_TRIGGER_is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; - rcu_read_lock(); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - rcu_read_unlock(); + scoped_guard(rcu) + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); if (level) { /* @@ -299,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, * sort it out: */ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - rcu_read_unlock(); + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; + } } else { struct bkey_ptrs ptrs; union bch_extent_entry *entry; @@ -370,19 +377,41 @@ found: bch_info(c, "new key %s", buf.buf); } - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; + if (!(flags & BTREE_TRIGGER_is_root)) { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(trans, &iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } else { + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, + jset_u64s(new->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); + if (ret) + goto err; + + journal_entry_set(e, + BCH_JSET_ENTRY_btree_root, + btree, level - 1, + new, new->k.u64s); - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + /* + * no locking, we're single threaded and not rw yet, see + * the big assertino above that we repeat here: + */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + + struct btree *b = bch2_btree_id_root(c, btree)->b; + bkey_copy(&b->key, new); + } } err: printbuf_exit(&buf); @@ -406,7 +435,15 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf if (insert) { bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); - ret = -BCH_ERR_bucket_ref_update; + /* + * If we're in recovery, run_explicit_recovery_pass might give + * us an error code for rewinding recovery + */ + if (!ret) + ret = bch_err_throw(c, bucket_ref_update); + } else { + /* Always ignore overwrite errors, so that deletion works */ + ret = 0; } if (print || insert) @@ -595,7 +632,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); goto err; } @@ -603,7 +640,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (!bucket_valid(ca, bucket.offset)) { if (insert) { bch2_dev_bucket_missing(ca, bucket.offset); - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); } goto err; } @@ -625,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); goto err; } @@ -651,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, s64 sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, @@ -668,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -BCH_ERR_trigger_stripe_pointer; + ret = bch_err_throw(c, trigger_stripe_pointer); goto err; } @@ -688,13 +727,11 @@ err: } if (flags & BTREE_TRIGGER_gc) { - struct bch_fs *c = trans->c; - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.ec.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; + return bch_err_throw(c, ENOMEM_mark_stripe_ptr); } gc_stripe_lock(m); @@ -709,7 +746,7 @@ err: __bch2_inconsistent_error(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - return -BCH_ERR_trigger_stripe_pointer; + return bch_err_throw(c, trigger_stripe_pointer); } m->block_sectors[p.ec.block] += sectors; @@ -732,8 +769,7 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags, - s64 *replicas_sectors) + enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -744,6 +780,8 @@ static int __trigger_extent(struct btree_trans *trans, : BCH_DATA_user; int ret = 0; + s64 replicas_sectors = 0; + struct disk_accounting_pos acc_replicas_key; memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; @@ -770,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - *replicas_sectors += disk_sectors; + replicas_sectors += disk_sectors; replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -808,13 +846,13 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); if (ret) return ret; } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } @@ -830,7 +868,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { @@ -839,7 +877,7 @@ static int __trigger_extent(struct btree_trans *trans, s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), - *replicas_sectors, + replicas_sectors, }; ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) @@ -871,20 +909,16 @@ int bch2_trigger_extent(struct btree_trans *trans, return 0; if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 old_replicas_sectors = 0, new_replicas_sectors = 0; - if (old.k->type) { int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert, - &old_replicas_sectors); + flags & ~BTREE_TRIGGER_insert); if (ret) return ret; } if (new.k->type) { int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite, - &new_replicas_sectors); + flags & ~BTREE_TRIGGER_overwrite); if (ret) return ret; } @@ -971,15 +1005,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_type_str(type), bch2_data_type_str(type)); - bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); + bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - bch2_run_explicit_recovery_pass(c, &buf, + ret = bch2_run_explicit_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_allocations, 0); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); + /* Always print, this is always fatal */ + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_metadata_bucket_inconsistency; + if (!ret) + ret = bch_err_throw(c, metadata_bucket_inconsistency); goto err; } @@ -1032,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * err_unlock: bucket_unlock(g); err: - return -BCH_ERR_metadata_bucket_inconsistency; + return bch_err_throw(c, metadata_bucket_inconsistency); } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -1247,7 +1282,7 @@ recalculate: ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); - ret = -BCH_ERR_ENOSPC_disk_reservation; + ret = bch_err_throw(c, ENOSPC_disk_reservation); } mutex_unlock(&c->sectors_available_lock); @@ -1276,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { bch2_dev_put(ca); - return -BCH_ERR_ENOMEM_buckets_nouse; + return bch_err_throw(c, ENOMEM_buckets_nouse); } } @@ -1301,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) lockdep_assert_held(&c->state_lock); if (resize && ca->buckets_nouse) - return -BCH_ERR_no_resize_with_buckets_nouse; + return bch_err_throw(c, no_resize_with_buckets_nouse); bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { - ret = -BCH_ERR_ENOMEM_bucket_gens; + ret = bch_err_throw(c, ENOMEM_bucket_gens); goto err; } @@ -1325,9 +1360,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) sizeof(bucket_gens->b[0]) * copy); } - ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch, + ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, ca->mi.nbuckets, nbuckets) ?: - bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty, + bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, ca->mi.nbuckets, nbuckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); @@ -1354,7 +1389,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) - return -BCH_ERR_ENOMEM_usage_init; + return bch_err_throw(c, ENOMEM_usage_init); return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index af1532de4a37..49a3807a5eab 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { - rcu_read_lock(); - int ret = bucket_gen_get_rcu(ca, b); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return bucket_gen_get_rcu(ca, b); } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ */ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - rcu_read_lock(); - int ret = dev_ptr_stale_rcu(ca, ptr); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return dev_ptr_stale_rcu(ca, ptr); } /* Device usage: */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index c8a488e6b7b8..832eff93acb6 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -108,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { - ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; + struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); + ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); goto out; } diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4066946b26bc..5ea89aa2b0c4 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -319,6 +319,7 @@ static int bch2_data_thread(void *arg) ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; } + enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); return 0; } @@ -378,15 +379,24 @@ static long bch2_ioctl_data(struct bch_fs *c, struct bch_data_ctx *ctx; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) + return -EROFS; - if (arg.op >= BCH_DATA_OP_NR || arg.flags) - return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto put_ref; + } + + if (arg.op >= BCH_DATA_OP_NR || arg.flags) { + ret = -EINVAL; + goto put_ref; + } ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + if (!ctx) { + ret = -ENOMEM; + goto put_ref; + } ctx->c = c; ctx->arg = arg; @@ -395,11 +405,16 @@ static long bch2_ioctl_data(struct bch_fs *c, &bcachefs_data_ops, bch2_data_thread); if (ret < 0) - kfree(ctx); + goto cleanup; + return ret; +cleanup: + kfree(ctx); +put_ref: + enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); return ret; } -static long bch2_ioctl_fs_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c, struct bch_ioctl_fs_usage __user *user_arg) { struct bch_ioctl_fs_usage arg = {}; @@ -469,7 +484,7 @@ err: } /* obsolete, didn't allow for new data types: */ -static long bch2_ioctl_dev_usage(struct bch_fs *c, +static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { struct bch_ioctl_dev_usage arg; @@ -613,15 +628,12 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - rcu_read_lock(); + guard(rcu)(); for_each_online_member_rcu(c, ca) - if (ca->dev == dev) { - rcu_read_unlock(); + if (ca->dev == dev) return ca->dev_idx; - } - rcu_read_unlock(); - return -BCH_ERR_ENOENT_dev_idx_not_found; + return bch_err_throw(c, ENOENT_dev_idx_not_found); } static long bch2_ioctl_disk_resize(struct bch_fs *c, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index d3e2e4f776c6..a6795e73f0b9 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20(&c->chacha20_key, nonce, data, len); return 0; @@ -262,7 +262,7 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); @@ -375,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_recompute_checksum; + return bch_err_throw(c, recompute_checksum); } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -659,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt = bch2_sb_field_resize(&c->disk_sb, crypt, sizeof(*crypt) / sizeof(u64)); if (!crypt) { - ret = -BCH_ERR_ENOSPC_sb_crypt; + ret = bch_err_throw(c, ENOSPC_sb_crypt); goto err; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index f57f9f4774e6..8e9264b5a84e 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) struct io_clock_wait { struct io_timer io_timer; - struct timer_list cpu_timer; struct task_struct *task; int expired; }; @@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer) wake_up_process(wait->task); } -static void io_clock_cpu_timeout(struct timer_list *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, cpu_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { @@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) bch2_io_timer_del(clock, &wait.io_timer); } -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { @@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_add(clock, &wait.io_timer); - timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); - - if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) - mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - - do { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread && kthread_should_stop()) - break; - - if (wait.expired) - break; - - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!(kthread && kthread_should_stop())) { + cpu_timeout = schedule_timeout(cpu_timeout); try_to_freeze(); - } while (0); + } __set_current_state(TASK_RUNNING); - timer_delete_sync(&wait.cpu_timer); - timer_destroy_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); + return cpu_timeout; +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + + while (!(kthread && kthread_should_stop()) && + cpu_timeout && + atomic64_read(&clock->now) < io_until) + cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 82c79c8baf92..8769be2aa21e 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,6 +4,7 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); void __bch2_increment_clock(struct io_clock *, u64); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1bca61d17092..b37b1f325f0a 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, __bch2_compression_types[crc.compression_type])) ret = bch2_check_set_has_compressed_data(c, opt); else - ret = -BCH_ERR_compression_workspace_not_initialized; + ret = bch_err_throw(c, compression_workspace_not_initialized); if (ret) goto err; } @@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_lz4; + ret = bch_err_throw(c, decompress_lz4); break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != Z_STREAM_END) - ret = -BCH_ERR_decompress_gzip; + ret = bch_err_throw(c, decompress_gzip); break; } case BCH_COMPRESSION_TYPE_zstd: { @@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t real_src_len = le32_to_cpup(src_data.b); if (real_src_len > src_len - 4) { - ret = -BCH_ERR_decompress_zstd_src_len_bad; + ret = bch_err_throw(c, decompress_zstd_src_len_bad); goto err; } @@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_zstd; + ret = bch_err_throw(c, decompress_zstd); break; } default: @@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, bch2_write_op_error(op, op->pos.offset, "extent too big to decompress (%u > %u)", crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); } data = __bounce_alloc(c, dst_len, WRITE); @@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) @@ -656,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!mempool_initialized(&c->compression_bounce[READ]) && mempool_init_kvmalloc_pool(&c->compression_bounce[READ], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_read_init; + return bch_err_throw(c, ENOMEM_compression_bounce_read_init); if (!mempool_initialized(&c->compression_bounce[WRITE]) && mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_write_init; + return bch_err_throw(c, ENOMEM_compression_bounce_write_init); for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); @@ -675,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) - return -BCH_ERR_ENOMEM_compression_workspace_init; + return bch_err_throw(c, ENOMEM_compression_workspace_init); } return 0; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 50ec3decfe8c..4080ee99aadd 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -8,6 +8,7 @@ * Inspired by CCAN's darray */ +#include <linux/cleanup.h> #include <linux/slab.h> #define DARRAY_PREALLOCATED(_type, _nr) \ @@ -87,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) -#define __darray_for_each(_d, _i) \ +#define darray_find_p(_d, _i, cond) \ +({ \ + typeof((_d).data) _ret = NULL; \ + \ + darray_for_each(_d, _i) \ + if (cond) { \ + _ret = _i; \ + break; \ + } \ + _ret; \ +}) + +#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) + +/* Iteration: */ + +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -96,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_for_each_reverse(_d, _i) \ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) +/* Init/exit */ + #define darray_init(_d) \ do { \ (_d)->nr = 0; \ @@ -111,4 +130,29 @@ do { \ darray_init(_d); \ } while (0) +#define DEFINE_DARRAY_CLASS(_type) \ +DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) + +#define DEFINE_DARRAY(_type) \ +typedef DARRAY(_type) darray_##_type; \ +DEFINE_DARRAY_CLASS(darray_##_type) + +#define DEFINE_DARRAY_NAMED(_name, _type) \ +typedef DARRAY(_type) _name; \ +DEFINE_DARRAY_CLASS(_name) + +DEFINE_DARRAY_CLASS(darray_char); +DEFINE_DARRAY_CLASS(darray_str) +DEFINE_DARRAY_CLASS(darray_const_str) + +DEFINE_DARRAY_CLASS(darray_u8) +DEFINE_DARRAY_CLASS(darray_u16) +DEFINE_DARRAY_CLASS(darray_u32) +DEFINE_DARRAY_CLASS(darray_u64) + +DEFINE_DARRAY_CLASS(darray_s8) +DEFINE_DARRAY_CLASS(darray_s16) +DEFINE_DARRAY_CLASS(darray_s32) +DEFINE_DARRAY_CLASS(darray_s64) + #endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c34e5b88ba9d..e848e210a9bf 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -66,37 +66,46 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) } } -static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +static noinline_for_stack +bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, + const struct bch_extent_ptr *start) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (!ctxt) { + bkey_for_each_ptr(ptrs, ptr) { + if (ptr == start) + break; + + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } - bkey_for_each_ptr(ptrs, ptr) { + __bkey_for_each_ptr(start, ptrs.end, ptr) { struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - if (ctxt) { - bool locked; - - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); + bool locked; + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } + return true; +} - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs) +{ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - ca = bch2_dev_have_ref(c, ptr2->dev); - bucket = PTR_BUCKET_POS(ca, ptr2); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - } + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) + return __bkey_nocow_lock(c, ctxt, ptrs, ptr); } + return true; } @@ -240,13 +249,14 @@ static int data_update_invalid_bkey(struct data_update *m, bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\nnew: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_newline(&buf); bch2_fs_emergency_read_only2(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - return -BCH_ERR_invalid_bkey; + return bch_err_throw(c, invalid_bkey); } static int __bch2_data_update_index_update(struct btree_trans *trans, @@ -367,21 +377,21 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ - rcu_read_lock(); + scoped_guard(rcu) { restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } } } - rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -523,8 +533,9 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_buf_exit(&update->k, c); } -static int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static noinline_for_stack +int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; struct bkey_i_extent *e; @@ -716,18 +727,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) +static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts, + unsigned buf_bytes) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); @@ -751,11 +754,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); +} + static int can_write_extent(struct bch_fs *c, struct data_update *m) { if ((m->op.flags & BCH_WRITE_alloc_nowait) && unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return -BCH_ERR_data_update_done_would_block; + return bch_err_throw(c, data_update_done_would_block); unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target @@ -765,7 +783,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); - rcu_read_lock(); + guard(rcu)(); + unsigned nr_replicas = 0, i; for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); @@ -782,12 +801,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) if (nr_replicas >= m->op.nr_replicas) break; } - rcu_read_unlock(); if (!nr_replicas) - return -BCH_ERR_data_update_done_no_rw_devs; + return bch_err_throw(c, data_update_done_no_rw_devs); if (nr_replicas < m->op.nr_replicas) - return -BCH_ERR_insufficient_devices; + return bch_err_throw(c, insufficient_devices); return 0; } @@ -802,19 +820,21 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; - /* - * fs is corrupt we have a key for a snapshot node that doesn't exist, - * and we have to check for this because we go rw before repairing the - * snapshots table - just skip it, we can move it later. - */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done_no_snapshot; + if (k.k->p.snapshot) { + ret = bch2_check_key_has_snapshot(trans, iter, k); + if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { + /* Can't repair yet, waiting on other recovery passes */ + return bch_err_throw(c, data_update_done_no_snapshot); + } + if (ret < 0) + return ret; + if (ret) /* key was deleted */ + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, data_update_done_no_snapshot); + ret = 0; + } bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -842,10 +862,17 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned durability_have = 0, durability_removing = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned buf_bytes = 0; + bool unwritten = false; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { - rcu_read_lock(); + guard(rcu)(); if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; @@ -856,7 +883,6 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); } /* @@ -872,6 +898,9 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + unwritten |= p.ptr.unwritten; + ptr_bit <<= 1; } @@ -910,7 +939,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); if (!ret) - ret = -BCH_ERR_data_update_done_no_writes_needed; + ret = bch_err_throw(c, data_update_done_no_writes_needed); goto out_bkey_buf_exit; } @@ -941,23 +970,25 @@ int bch2_data_update_init(struct btree_trans *trans, } if (!bkey_get_dev_refs(c, k)) { - ret = -BCH_ERR_data_update_done_no_dev_refs; + ret = bch_err_throw(c, data_update_done_no_dev_refs); goto out_put_disk_res; } if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - ret = -BCH_ERR_nocow_lock_blocked; + !bkey_nocow_lock(c, ctxt, ptrs)) { + ret = bch_err_throw(c, nocow_lock_blocked); goto out_put_dev_refs; } - if (bkey_extent_is_unwritten(k)) { + if (unwritten) { ret = bch2_update_unwritten_extent(trans, m) ?: - -BCH_ERR_data_update_done_unwritten; + bch_err_throw(c, data_update_done_unwritten); goto out_nocow_unlock; } - ret = bch2_data_update_bios_init(m, c, io_opts); + bch2_trans_unlock(trans); + + ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); if (ret) goto out_nocow_unlock; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4fa70634c90e..07c2a0f73cc2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -153,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) c->verify_data = __bch2_btree_node_mem_alloc(c); if (!c->verify_data) goto out; - - list_del_init(&c->verify_data->list); } BUG_ON(b->nsets != 1); @@ -492,6 +490,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * prt_printf(out, "journal pin %px:\t%llu\n", &b->writes[1].journal, b->writes[1].journal.seq); + prt_printf(out, "ob:\t%u\n", b->ob.nr); + printbuf_indent_sub(out, 2); } @@ -508,27 +508,27 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, i->ret = 0; do { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - ret = bch2_debugfs_flush_buf(i); if (ret) return ret; - rcu_read_lock(); i->buf.atomic++; - tbl = rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; + scoped_guard(rcu) { + struct bucket_table *tbl = + rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + struct rhash_head *pos; + struct btree *b; + + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } } --i->buf.atomic; - rcu_read_unlock(); } while (!done); if (i->buf.allocation_failure) @@ -584,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, i->ubuf = buf; i->size = size; i->ret = 0; + + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); restart: seqmutex_lock(&c->btree_trans_lock); list_sort(&c->btree_trans_list, list_ptr_order_cmp); @@ -597,6 +599,11 @@ restart: if (!closure_get_not_zero(&trans->ref)) continue; + if (!trans->srcu_held) { + closure_put(&trans->ref); + continue; + } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); @@ -618,6 +625,8 @@ restart: } seqmutex_unlock(&c->btree_trans_lock); unlocked: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + if (i->buf.allocation_failure) ret = -ENOMEM; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d198001838f3..28875c5c86ad 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,12 +13,15 @@ #include <linux/dcache.h> +#ifdef CONFIG_UNICODE int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, const struct qstr *str, struct qstr *out_cf) { *out_cf = (struct qstr) QSTR_INIT(NULL, 0); -#ifdef CONFIG_UNICODE + if (!bch2_fs_casefold_enabled(trans->c)) + return -EOPNOTSUPP; + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); int ret = PTR_ERR_OR_ZERO(buf); if (ret) @@ -30,10 +33,8 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, *out_cf = (struct qstr) QSTR_INIT(buf, ret); return 0; -#else - return -EOPNOTSUPP; -#endif } +#endif static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { @@ -231,70 +232,66 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, - subvol_inum dir, - u8 type, - int name_len, int cf_name_len, - u64 dst) +int bch2_dirent_init_name(struct bch_fs *c, + struct bkey_i_dirent *dirent, + const struct bch_hash_info *hash_info, + const struct qstr *name, + const struct qstr *cf_name) { - struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); + EBUG_ON(hash_info->cf_encoding == NULL && cf_name); + int cf_len = 0; - BUG_ON(u64s > U8_MAX); - - dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; + if (name->len > BCH_NAME_MAX) + return -ENAMETOOLONG; - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = u64s; + dirent->v.d_casefold = hash_info->cf_encoding != NULL; - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); + if (!dirent->v.d_casefold) { + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); } else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } + if (!bch2_fs_casefold_enabled(c)) + return -EOPNOTSUPP; - dirent->v.d_type = type; - dirent->v.d_unused = 0; - dirent->v.d_casefold = cf_name_len ? 1 : 0; +#ifdef CONFIG_UNICODE + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - return dirent; -} + char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; -static void dirent_init_regular_name(struct bkey_i_dirent *dirent, - const struct qstr *name) -{ - EBUG_ON(dirent->v.d_casefold); + if (cf_name) { + cf_len = cf_name->len; - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); -} + memcpy(cf_out, cf_name->name, cf_name->len); + } else { + cf_len = utf8_casefold(hash_info->cf_encoding, name, + cf_out, + bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out); + if (cf_len <= 0) + return cf_len; + } -static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(!dirent->v.d_casefold); - EBUG_ON(!cf_name->len); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - name->len + cf_name->len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + name->len + cf_len); + + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); +#endif + } + + unsigned u64s = dirent_val_u64s(name->len, cf_len); + BUG_ON(u64s > bkey_val_u64s(&dirent->k)); + set_bkey_val_u64s(&dirent->k, u64s); + return 0; } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans, const struct bch_hash_info *hash_info, subvol_inum dir, u8 type, @@ -302,31 +299,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, const struct qstr *cf_name, u64 dst) { - struct bkey_i_dirent *dirent; - struct qstr _cf_name; - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; - if (hash_info->cf_encoding && !cf_name) { - int ret = bch2_casefold(trans, hash_info, name, &_cf_name); - if (ret) - return ERR_PTR(ret); + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = BKEY_U64s_MAX; - cf_name = &_cf_name; + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); } - dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst); - if (IS_ERR(dirent)) - return dirent; + dirent->v.d_type = type; + dirent->v.d_unused = 0; - if (cf_name) - dirent_init_casefolded_name(dirent, name, cf_name); - else - dirent_init_regular_name(dirent, name); + int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); + if (ret) + return ERR_PTR(ret); EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; } @@ -341,7 +335,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -365,7 +359,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -402,8 +396,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) @@ -470,8 +464,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); + new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -481,8 +475,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); + new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, + src_hash->cf_encoding ? &src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -542,14 +536,6 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - if (old_dst.k) - *dst_dir_i_size -= bkey_bytes(old_dst.k); - *src_dir_i_size -= bkey_bytes(old_src.k); - - if (mode == BCH_RENAME_EXCHANGE) - *src_dir_i_size += bkey_bytes(&new_src->k); - *dst_dir_i_size += bkey_bytes(&new_dst->k); - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -656,7 +642,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) continue; - ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; + ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); break; } bch2_trans_iter_exit(trans, &iter); @@ -692,7 +678,9 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv return !ret; } -int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, + struct bch_hash_info *hash_info, + struct dir_context *ctx) { struct bkey_buf sk; bch2_bkey_buf_init(&sk); @@ -710,7 +698,11 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); subvol_inum target; - int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + + bool need_second_pass = false; + int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, + hash_info, &iter, k, &need_second_pass) ?: + bch2_dirent_read_target(trans, inum, dirent, &target); if (ret2 > 0) continue; @@ -740,7 +732,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); goto found; } - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(trans->c, ENOENT_inode); found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index d3e7ae669575..0417608c18d5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,8 +23,16 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; +#ifdef CONFIG_UNICODE int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, const struct qstr *, struct qstr *); +#else +static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + return -EOPNOTSUPP; +} +#endif static inline int bch2_maybe_casefold(struct btree_trans *trans, const struct bch_hash_info *info, @@ -38,7 +46,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans, } } -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { @@ -59,6 +67,15 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int bch2_dirent_init_name(struct bch_fs *, + struct bkey_i_dirent *, + const struct bch_hash_info *, + const struct qstr *, + const struct qstr *); +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, + const struct bch_hash_info *, subvol_inum, u8, + const struct qstr *, const struct qstr *, u64); + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, @@ -80,8 +97,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, u64 *, - subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -95,7 +112,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b3840ff7c407..f7528cd69c73 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -390,7 +390,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun err: free_percpu(n.v[1]); free_percpu(n.v[0]); - return -BCH_ERR_ENOMEM_disk_accounting; + return bch_err_throw(c, ENOMEM_disk_accounting); } int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, @@ -401,7 +401,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); @@ -419,7 +419,7 @@ int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounti if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); return __bch2_accounting_mem_insert(c, a); } @@ -559,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c) sizeof(u64), GFP_KERNEL); if (!e->v[1]) { bch2_accounting_free_counters(acc, true); - ret = -BCH_ERR_ENOMEM_disk_accounting; + ret = bch_err_throw(c, ENOMEM_disk_accounting); break; } } @@ -618,7 +618,9 @@ int bch2_gc_accounting_done(struct bch_fs *c) for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; - if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { + bch2_trans_unlock_long(trans); + + if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); @@ -737,7 +739,7 @@ invalid_device: bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { - ret = -BCH_ERR_remove_disk_accounting_entry; + ret = bch_err_throw(c, remove_disk_accounting_entry); } goto fsck_err; } @@ -897,8 +899,8 @@ int bch2_accounting_read(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; @@ -910,9 +912,9 @@ int bch2_accounting_read(struct bch_fs *c) k.dev_data_type.data_type == BCH_DATA_journal) usage->hidden += v[0] * ca->mi.bucket_size; } - rcu_read_unlock(); break; } + } } preempt_enable(); fsck_err: @@ -1006,19 +1008,18 @@ void bch2_verify_accounting_clean(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) { - rcu_read_unlock(); - continue; + case BCH_DISK_ACCOUNTING_dev_data_type: + { + guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (!ca) + continue; + + v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); } - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - rcu_read_unlock(); - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { struct printbuf buf = PRINTBUF; @@ -1032,7 +1033,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } } - } 0; }))); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f6098e33ab30..d61abebf3e0b 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); } - rcu_read_unlock(); break; } + } } unsigned idx; diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index c20ecf5e5381..cde842ac1886 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -130,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) - return -BCH_ERR_ENOMEM_disk_groups_to_cpu; + return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); cpu_g->nr = nr_groups; @@ -170,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); - struct bch_devs_mask *devs; - rcu_read_lock(); + guard(rcu)(); switch (t.type) { case TARGET_NULL: - devs = NULL; - break; + return NULL; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - devs = ca ? &ca->self : NULL; - break; + return ca ? &ca->self : NULL; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - devs = g && t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - break; } default: BUG(); } - - rcu_read_unlock(); - - return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) @@ -384,7 +376,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) bch2_printbuf_make_room(out, 4096); out->atomic++; - rcu_read_lock(); + guard(rcu)(); struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); for (unsigned i = 0; i < (g ? g->nr : 0); i++) { @@ -405,16 +397,14 @@ next: prt_newline(out); } - rcu_read_unlock(); out->atomic--; } void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) { out->atomic++; - rcu_read_lock(); + guard(rcu)(); __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), - rcu_read_unlock(); --out->atomic; } @@ -535,13 +525,11 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) switch (t.type) { case TARGET_NULL: prt_printf(out, "none"); - break; + return; case TARGET_DEV: { - struct bch_dev *ca; - out->atomic++; - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices + guard(rcu)(); + struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; @@ -552,13 +540,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) else prt_printf(out, "invalid device %u", t.dev); - rcu_read_unlock(); out->atomic--; - break; + return; } case TARGET_GROUP: bch2_disk_path_to_text(out, c, t.group); - break; + return; default: BUG(); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c581426e3894..543dbba9b14f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -213,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -224,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } } else { @@ -234,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -244,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -256,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } } @@ -295,7 +295,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -325,7 +325,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -428,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); if (!gc) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return -BCH_ERR_ENOMEM_mark_stripe; + return bch_err_throw(c, ENOMEM_mark_stripe); } /* @@ -536,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) } /* XXX: this is a non-mempoolified memory allocation: */ -static int ec_stripe_buf_init(struct ec_stripe_buf *buf, +static int ec_stripe_buf_init(struct bch_fs *c, + struct ec_stripe_buf *buf, unsigned offset, unsigned size) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; @@ -564,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, return 0; err: ec_stripe_buf_exit(buf); - return -BCH_ERR_ENOMEM_stripe_buf; + return bch_err_throw(c, ENOMEM_stripe_buf); } /* Checksumming: */ @@ -840,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) - return -BCH_ERR_ENOMEM_ec_read_extent; + return bch_err_throw(c, ENOMEM_ec_read_extent); ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { @@ -861,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, goto err; } - ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); if (ret) { msg = "-ENOMEM"; goto err; @@ -894,7 +895,7 @@ err: bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); printbuf_exit(&msgbuf); - ret = -BCH_ERR_stripe_reconstruct; + ret = bch_err_throw(c, stripe_reconstruct); goto out; } @@ -904,7 +905,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc); return 0; } @@ -1129,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_erasure_coding_found_btree_node; + return bch_err_throw(c, erasure_coding_found_btree_node); } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1195,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -BCH_ERR_ENOENT_dev_not_found; + return bch_err_throw(c, ENOENT_dev_not_found); struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1256,7 +1257,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { - s->err = -BCH_ERR_erofs_no_writes; + s->err = bch_err_throw(c, erofs_no_writes); return; } @@ -1320,7 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); - ret = -BCH_ERR_ec_block_read; + ret = bch_err_throw(c, ec_block_read); goto err; } @@ -1346,7 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); - ret = -BCH_ERR_ec_block_write; + ret = bch_err_throw(c, ec_block_write); goto err; } @@ -1578,26 +1579,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { struct bch_devs_mask devs = h->devs; + unsigned nr_devs, nr_devs_with_durability; - rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label - ? group_to_target(h->disk_label - 1) - : 0); - unsigned nr_devs = dev_mask_nr(&h->devs); + scoped_guard(rcu) { + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? group_to_target(h->disk_label - 1) + : 0); + nr_devs = dev_mask_nr(&h->devs); - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); + for_each_member_device_rcu(c, ca, &h->devs) + if (!ca->mi.durability) + __clear_bit(ca->dev_idx, h->devs.d); + nr_devs_with_durability = dev_mask_nr(&h->devs); - h->blocksize = pick_blocksize(c, &h->devs); + h->blocksize = pick_blocksize(c, &h->devs); - h->nr_active_devs = 0; - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; - - rcu_read_unlock(); + h->nr_active_devs = 0; + for_each_member_device_rcu(c, ca, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + } /* * If we only have redundancy + 1 devices, we're better off with just @@ -1865,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { bch2_stripe_close(c, s); return ret; @@ -1925,7 +1926,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri } bch2_trans_iter_exit(trans, &lru_iter); if (!ret) - ret = -BCH_ERR_stripe_alloc_blocked; + ret = bch_err_throw(c, stripe_alloc_blocked); if (ret == 1) ret = 0; if (ret) @@ -1966,7 +1967,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st continue; } - ret = -BCH_ERR_ENOSPC_stripe_create; + ret = bch_err_throw(c, ENOSPC_stripe_create); break; } @@ -2024,7 +2025,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (!h->s) { h->s = ec_new_stripe_alloc(c, h); if (!h->s) { - ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); bch_err(c, "failed to allocate new stripe"); goto err; } @@ -2089,7 +2090,7 @@ alloc_existing: goto err; allocate_buf: - ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); if (ret) goto err; @@ -2115,6 +2116,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, if (k.k->type != KEY_TYPE_stripe) return 0; + struct bch_fs *c = trans->c; struct bkey_i_stripe *s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); @@ -2141,23 +2143,22 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, unsigned nr_good = 0; - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev_idx) - ptr->dev = BCH_SB_MEMBER_INVALID; + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) + ptr->dev = BCH_SB_MEMBER_INVALID; - struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev); - nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; - } - rcu_read_unlock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); sectors = -sectors; @@ -2178,14 +2179,15 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s return 0; if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; + struct bch_fs *c = trans->c; + bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); + return bch_err_throw(c, invalidate_stripe_to_dev); } struct btree_iter iter; struct bkey_s_c_stripe s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + BTREE_ITER_slots, stripe); int ret = bkey_err(s); if (ret) return ret; diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index 43557bebd0f8..c39cf304c681 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = { NULL }; -static unsigned bch2_errcode_parents[] = { +static const unsigned bch2_errcode_parents[] = { #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() #undef x }; +__attribute__((const)) const char *bch2_err_str(int err) { const char *errstr; @@ -36,6 +37,7 @@ const char *bch2_err_str(int err) return errstr ?: "(Invalid error)"; } +__attribute__((const)) bool __bch2_err_matches(int err, int class) { err = abs(err); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 62843e772b2c..acc3b7b67704 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -137,7 +137,6 @@ x(BCH_ERR_transaction_restart, transaction_restart_relock) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ - x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ @@ -148,11 +147,8 @@ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ - x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ - x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ @@ -182,9 +178,12 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, restart_recovery) \ - x(EINVAL, cannot_rewind_recovery) \ + x(EINVAL, recovery_will_run) \ + x(BCH_ERR_recovery_will_run, restart_recovery) \ + x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ + x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ x(0, data_update_done) \ + x(0, bkey_was_deleted) \ x(BCH_ERR_data_update_done, data_update_done_would_block) \ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ @@ -211,6 +210,8 @@ x(EINVAL, remove_would_lose_data) \ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ + x(EINVAL, inode_not_unlinked) \ + x(EINVAL, inode_has_child_snapshot) \ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ x(EINVAL, option_negative) \ @@ -236,7 +237,6 @@ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ x(BCH_ERR_journal_res_blocked, journal_stuck) \ x(BCH_ERR_journal_res_blocked, journal_retry_open) \ - x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ @@ -282,7 +282,6 @@ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ - x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ x(EIO, trigger_alloc) \ @@ -357,9 +356,11 @@ enum bch_errcode { BCH_ERR_MAX }; -const char *bch2_err_str(int); -bool __bch2_err_matches(int, int); +__attribute__((const)) const char *bch2_err_str(int); +__attribute__((const)) bool __bch2_err_matches(int, int); + +__attribute__((const)) static inline bool _bch2_err_matches(int err, int class) { return err < 0 && __bch2_err_matches(err, class); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c2cad28635bf..267e73d9d7e6 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -69,7 +69,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; @@ -100,10 +100,10 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) set_bit(BCH_FS_topology_error, &c->flags); if (!test_bit(BCH_FS_in_recovery, &c->flags)) { __bch2_inconsistent_error(c, out); - return -BCH_ERR_btree_need_topology_repair; + return bch_err_throw(c, btree_need_topology_repair); } else { return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: - -BCH_ERR_btree_node_read_validate_error; + bch_err_throw(c, btree_need_topology_repair); } } @@ -403,23 +403,23 @@ int bch2_fsck_err_opt(struct bch_fs *c, if (test_bit(BCH_FS_in_fsck, &c->flags)) { if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); switch (c->opts.fix_errors) { case FSCK_FIX_exit: - return -BCH_ERR_fsck_errors_not_fixed; + return bch_err_throw(c, fsck_errors_not_fixed); case FSCK_FIX_yes: if (flags & FSCK_CAN_FIX) - return -BCH_ERR_fsck_fix; + return bch_err_throw(c, fsck_fix); fallthrough; case FSCK_FIX_no: if (flags & FSCK_CAN_IGNORE) - return -BCH_ERR_fsck_ignore; - return -BCH_ERR_fsck_errors_not_fixed; + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); case FSCK_FIX_ask: if (flags & FSCK_AUTOFIX) - return -BCH_ERR_fsck_fix; - return -BCH_ERR_fsck_ask; + return bch_err_throw(c, fsck_fix); + return bch_err_throw(c, fsck_ask); default: BUG(); } @@ -427,12 +427,12 @@ int bch2_fsck_err_opt(struct bch_fs *c, if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) - return -BCH_ERR_fsck_fix; + return bch_err_throw(c, fsck_fix); if (c->opts.errors == BCH_ON_ERROR_continue && (flags & FSCK_CAN_IGNORE)) - return -BCH_ERR_fsck_ignore; - return -BCH_ERR_fsck_errors_not_fixed; + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); } } @@ -444,7 +444,7 @@ int __bch2_fsck_err(struct bch_fs *c, { va_list args; struct printbuf buf = PRINTBUF, *out = &buf; - int ret = -BCH_ERR_fsck_ignore; + int ret = 0; const char *action_orig = "fix?", *action = action_orig; might_sleep(); @@ -474,8 +474,8 @@ int __bch2_fsck_err(struct bch_fs *c, if (test_bit(err, c->sb.errors_silent)) return flags & FSCK_CAN_FIX - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); printbuf_indent_add_nextline(out, 2); @@ -517,10 +517,10 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", "); if (flags & FSCK_CAN_FIX) { prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } goto print; @@ -532,18 +532,18 @@ int __bch2_fsck_err(struct bch_fs *c, "run fsck, and forward to devs so error can be marked for self-healing"); inconsistent = true; print = true; - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } } else if (c->opts.fix_errors == FSCK_FIX_exit) { prt_str(out, ", exiting"); - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { int fix = s && s->fix ? s->fix @@ -562,30 +562,37 @@ int __bch2_fsck_err(struct bch_fs *c, : FSCK_FIX_yes; ret = ret & 1 - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", not "); prt_actioning(out, action); + ret = bch_err_throw(c, fsck_ignore); + } + } else { + if (flags & FSCK_CAN_IGNORE) { + prt_str(out, ", continuing"); + ret = bch_err_throw(c, fsck_ignore); + } else { + prt_str(out, " (repair unimplemented)"); + ret = bch_err_throw(c, fsck_repair_unimplemented); } - } else if (!(flags & FSCK_CAN_IGNORE)) { - prt_str(out, " (repair unimplemented)"); } - if (ret == -BCH_ERR_fsck_ignore && + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && (c->opts.fix_errors == FSCK_FIX_exit || !(flags & FSCK_CAN_IGNORE))) - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); if (test_bit(BCH_FS_in_fsck, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore)) { + (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { exiting = true; print = true; } @@ -614,25 +621,32 @@ print: if (s) s->ret = ret; + if (trans && + !(flags & FSCK_ERR_NO_LOG) && + ret == -BCH_ERR_fsck_fix) + ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); +err: /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type * which recovery pass corrects it, to get the fsck exit status correct: */ - if (flags & FSCK_CAN_FIX) { - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + /* nothing */ + } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: + if (action != action_orig) kfree(action); printbuf_exit(&buf); + + BUG_ON(!ret); return ret; } @@ -650,12 +664,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, const char *fmt, ...) { if (from.flags & BCH_VALIDATE_silent) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); unsigned fsck_flags = 0; if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { if (test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 5123d4c86770..0c3c3a24fc6f 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -105,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *); #define fsck_err_wrap(_do) \ ({ \ int _ret = _do; \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ ret = _ret; \ goto fsck_err; \ } \ \ - _ret == -BCH_ERR_fsck_fix; \ + bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ }) #define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) @@ -170,10 +170,10 @@ do { \ int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ ret = _ret; \ - ret = -BCH_ERR_fsck_delete_bkey; \ + ret = bch_err_throw(c, fsck_delete_bkey); \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index b899ee75f5b9..e76e58a568bf 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -139,6 +139,17 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, if (ret) return ret; - bch2_cut_back(end, k); + /* tracepoint */ + + if (bpos_lt(end, k->k.p)) { + if (trace_extent_trim_atomic_enabled()) { + CLASS(printbuf, buf)(); + bch2_bpos_to_text(&buf, end); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); + trace_extent_trim_atomic(trans->c, buf.buf); + } + bch2_cut_back(end, k); + } return 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1ac9897f189d..83cbd77dcb9c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -50,34 +50,34 @@ void bch2_io_failures_to_text(struct printbuf *out, struct bch_io_failures *failed) { static const char * const error_types[] = { - "io", "checksum", "ec reconstruct", NULL + "btree validate", "io", "checksum", "ec reconstruct", NULL }; for (struct bch_dev_io_failures *f = failed->devs; f < failed->devs + failed->nr; f++) { unsigned errflags = - ((!!f->failed_io) << 0) | - ((!!f->failed_csum_nr) << 1) | - ((!!f->failed_ec) << 2); - - if (!errflags) - continue; + ((!!f->failed_btree_validate) << 0) | + ((!!f->failed_io) << 1) | + ((!!f->failed_csum_nr) << 2) | + ((!!f->failed_ec) << 3); bch2_printbuf_make_room(out, 1024); - rcu_read_lock(); out->atomic++; - struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); - if (ca) - prt_str(out, ca->name); - else - prt_printf(out, "(invalid device %u)", f->dev); + scoped_guard(rcu) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); + if (ca) + prt_str(out, ca->name); + else + prt_printf(out, "(invalid device %u)", f->dev); + } --out->atomic; - rcu_read_unlock(); prt_char(out, ' '); - if (is_power_of_2(errflags)) { + if (!errflags) { + prt_str(out, "no error - confused"); + } else if (is_power_of_2(errflags)) { prt_bitflags(out, error_types, errflags); prt_str(out, " error"); } else { @@ -193,7 +193,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) - return -BCH_ERR_key_type_error; + return bch_err_throw(c, key_type_error); rcu_read_lock(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -286,17 +286,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (!have_dirty_ptrs) return 0; if (have_missing_devs) - return -BCH_ERR_no_device_to_read_from; + return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) - return -BCH_ERR_data_read_csum_err; + return bch_err_throw(c, data_read_csum_err); if (have_io_errors) - return -BCH_ERR_data_read_io_err; + return bch_err_throw(c, data_read_io_err); /* * If we get here, we have pointers (bkey_ptrs_validate() ensures that), * but they don't point to valid devices: */ - return -BCH_ERR_no_devices_valid; + return bch_err_throw(c, no_devices_valid); } /* KEY_TYPE_btree_ptr: */ @@ -407,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc = bch2_extent_crc_unpack(l.k, NULL); rp.crc = bch2_extent_crc_unpack(r.k, NULL); + guard(rcu)(); + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != @@ -418,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - rcu_read_unlock(); if (!same_bucket) return false; @@ -838,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -853,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -1015,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_dev *ca; - bool ret = false; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) { - ret = true; - break; - } - rcu_read_unlock(); + !dev_ptr_stale_rcu(ca, ptr))) + return true; - return ret; + return false; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -1142,7 +1134,7 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, bool have_cached_ptr; unsigned drop_dev = ptr->dev; - rcu_read_lock(); + guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; @@ -1175,10 +1167,8 @@ restart_drop_ptrs: goto drop; ptr->cached = true; - rcu_read_unlock(); return; drop: - rcu_read_unlock(); bch2_bkey_drop_ptr_noerror(k, ptr); } @@ -1194,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { struct bch_dev *ca; - rcu_read_lock(); + guard(rcu)(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (!(ca = bch2_dev_rcu(c, ptr->dev)) || dev_ptr_stale_rcu(ca, ptr) > 0)); - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1217,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, struct bkey_ptrs ptrs; bool have_cached_ptr; - rcu_read_lock(); + guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; @@ -1230,7 +1219,6 @@ restart_drop_ptrs: } have_cached_ptr = true; } - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1238,7 +1226,7 @@ restart_drop_ptrs: void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { out->atomic++; - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, @@ -1262,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc else if (stale) prt_printf(out, " invalid"); } - rcu_read_unlock(); --out->atomic; } @@ -1528,7 +1515,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, struct bch_compression_opt opt = __bch2_compression_decode(r->compression); prt_printf(err, "invalid compression opt %u:%u", opt.type, opt.level); - return -BCH_ERR_invalid_bkey; + return bch_err_throw(c, invalid_bkey); } #endif break; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index e3a75dcca60c..1c54b9b5bd69 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -394,17 +394,9 @@ struct bch_writepage_state { struct bch_io_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; + struct blk_plug plug; }; -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - /* * Determine when a writepage io is full. We have to limit writepage bios to a * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to @@ -666,23 +658,23 @@ do_io: int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; + struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); + bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + + blk_start_plug(&w->plug); + int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); + if (w->io) + bch2_writepage_do_io(w); + blk_finish_plug(&w->plug); + kfree(w->tmp); + kfree(w); return bch2_err_class(ret); } /* buffered writes: */ -int bch2_write_begin(struct file *file, struct address_space *mapping, +int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -765,7 +757,7 @@ err_unlock: return bch2_err_class(ret); } -int bch2_write_end(struct file *file, struct address_space *mapping, +int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index 3207ebbb4ab4..14de91c27656 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t pos, +int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, unsigned len, struct folio **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, +int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index fbae9c1de746..c2cc405822f2 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c, if (!reserved) { bch2_disk_reservation_put(c, &disk_res); - return -BCH_ERR_ENOSPC_disk_reservation; + return bch_err_throw(c, ENOSPC_disk_reservation); } break; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b1e9ee28fc0f..a233f45875e9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -71,12 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush)) - ca = NULL; - rcu_read_unlock(); + scoped_guard(rcu) { + ca = rcu_dereference(c->devs[dev]); + if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush)) + ca = NULL; + } if (!ca) continue; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 05361a793206..4e72e654da96 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -268,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, } if (dst_dentry->d_inode) { - error = -BCH_ERR_EEXIST_subvolume_create; + error = bch_err_throw(c, EEXIST_subvolume_create); goto err3; } dir = dst_path.dentry->d_inode; if (IS_DEADDIR(dir)) { - error = -BCH_ERR_ENOENT_directory_dead; + error = bch_err_throw(c, ENOENT_directory_dead); goto err3; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ddfe89d84966..687af0eea0c2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -124,8 +124,9 @@ retry: goto err; struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - if (memcmp(&old_r, &new_r, sizeof(new_r))) { + if (rebalance_changed) { ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); if (ret) goto err; @@ -146,6 +147,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (rebalance_changed) + bch2_rebalance_wakeup(c); + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), @@ -718,7 +722,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; -#ifdef CONFIG_UNICODE if (!inode && IS_CASEFOLDED(vdir)) { /* * Do not cache a negative dentry in casefolded directories @@ -733,7 +736,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, */ return NULL; } -#endif return d_splice_alias(&inode->v, dentry); } @@ -1549,11 +1551,11 @@ static const struct vm_operations_struct bch_vm_ops = { .page_mkwrite = bch2_page_mkwrite, }; -static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +static int bch2_mmap_prepare(struct vm_area_desc *desc) { - file_accessed(file); + file_accessed(desc->file); - vma->vm_ops = &bch_vm_ops; + desc->vm_ops = &bch_vm_ops; return 0; } @@ -1569,11 +1571,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); if (!dir_emit_dots(file, ctx)) return 0; - int ret = bch2_readdir(c, inode_inum(inode), ctx); + int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); bch_err_fn(c, ret); return bch2_err_class(ret); @@ -1614,7 +1617,7 @@ static const __maybe_unused unsigned bch_flags_to_xflags[] = { }; static int bch2_fileattr_get(struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1677,7 +1680,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, static int bch2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1727,7 +1730,8 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); - return ret; + + return bch2_err_class(ret); } static const struct file_operations bch_file_operations = { @@ -1735,7 +1739,7 @@ static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, - .mmap = bch2_mmap, + .mmap_prepare = bch2_mmap_prepare, .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, @@ -2002,14 +2006,14 @@ retry: goto err; if (k.k->type != KEY_TYPE_dirent) { - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); if (ret) goto err; @@ -2175,7 +2179,13 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode_inum(inode)); + int ret = bch2_inode_rm(c, inode_inum(inode)); + if (ret && !bch2_err_matches(ret, EROFS)) { + bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", + inode->ei_inum.subvol, + inode->ei_inum.inum); + bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); + } /* * If we are deleting, we need it present in the vfs hash table @@ -2322,14 +2332,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; - rcu_read_lock(); + guard(rcu)(); for_each_online_member_rcu(c, ca) { if (!first) seq_putc(seq, ':'); first = false; seq_puts(seq, ca->disk_sb.sb_name); } - rcu_read_unlock(); return 0; } @@ -2480,6 +2489,14 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (ret) goto err_stop_fs; + /* + * We might be doing a RO mount because other options required it, or we + * have no alloc info and it's a small image with no room to regenerate + * it + */ + if (c->opts.read_only) + fc->sb_flags |= SB_RDONLY; + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); if (ret) @@ -2526,16 +2543,16 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - rcu_read_lock(); - for_each_online_member_rcu(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; + scoped_guard(rcu) { + for_each_online_member_rcu(c, ca) { + struct block_device *bdev = ca->disk_sb.bdev; - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - break; + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + break; + } } - rcu_read_unlock(); c->dev = sb->s_dev; @@ -2547,9 +2564,10 @@ got_sb: sb->s_shrink->seeks = 0; #ifdef CONFIG_UNICODE - sb->s_encoding = c->cf_encoding; -#endif + if (bch2_fs_casefold_enabled(c)) + sb->s_encoding = c->cf_encoding; generic_set_sb_d_ops(sb); +#endif vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 49f46df8340e..15c1e890d299 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,6 +12,7 @@ #include "fs.h" #include "fsck.h" #include "inode.h" +#include "io_misc.h" #include "keylist.h" #include "namei.h" #include "recovery_passes.h" @@ -23,14 +24,15 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, +static int dirent_points_to_inode_nowarn(struct bch_fs *c, + struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { if (d.v->d_type == DT_SUBVOL ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol : le64_to_cpu(d.v->d_inum) == inode->bi_inum) return 0; - return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); } static void dirent_inode_mismatch_msg(struct printbuf *out, @@ -49,7 +51,7 @@ static int dirent_points_to_inode(struct bch_fs *c, struct bkey_s_c_dirent dirent, struct bch_inode_unpacked *inode) { - int ret = dirent_points_to_inode_nowarn(dirent, inode); + int ret = dirent_points_to_inode_nowarn(c, dirent, inode); if (ret) { struct printbuf buf = PRINTBUF; dirent_inode_mismatch_msg(&buf, c, dirent, inode); @@ -152,7 +154,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans, goto found; } } - ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; + ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); found: bch2_trans_iter_exit(trans, &iter); return ret; @@ -229,7 +231,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return -BCH_ERR_ENOENT_not_directory; + return bch_err_throw(c, ENOENT_not_directory); } /* @@ -326,7 +328,8 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) (inode->bi_flags & BCH_INODE_has_child_snapshot)) return false; - return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); + return !bch2_inode_has_backpointer(inode) && + !(inode->bi_flags & BCH_INODE_unlinked); } static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) @@ -371,6 +374,18 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (inode->bi_subvol) { inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + struct btree_iter subvol_iter; + struct bkey_i_subvolume *subvol = + bch2_bkey_get_mut_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, inode->bi_subvol), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; + bch2_trans_iter_exit(trans, &subvol_iter); + u64 root_inum; ret = subvol_lookup(trans, inode->bi_parent_subvol, &dirent_snapshot, &root_inum); @@ -386,6 +401,8 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; + bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); + lostfound.bi_nlink += S_ISDIR(inode->bi_mode); /* ensure lost+found inode is also present in inode snapshot */ @@ -422,6 +439,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * if (ret) return ret; + { + CLASS(printbuf, buf)(); + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, + inode->bi_snapshot, NULL, &buf); + if (ret) + return ret; + + bch_info(c, "reattached at %s", buf.buf); + } + /* * Fix up inodes in child snapshots: if they should also be reattached * update the backpointer field, if they should not be we need to emit @@ -489,13 +516,21 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - if (!inode->bi_dir) + if (!bch2_inode_has_backpointer(inode)) return 0; + u32 snapshot = inode->bi_snapshot; + + if (inode->bi_parent_subvol) { + int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); + if (ret) + return ret; + } + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: bch2_fsck_remove_dirent(trans, d.k->p); @@ -531,7 +566,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub if (!bch2_snapshot_is_leaf(c, snapshotid)) { bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } /* @@ -643,11 +678,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 return __bch2_fsck_write_inode(trans, &new_inode); } -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - static inline void snapshots_seen_exit(struct snapshots_seen *s) { darray_exit(&s->ids); @@ -699,14 +729,8 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, u32 id, u32 ancestor) { - ssize_t i; - EBUG_ON(id > ancestor); - /* @ancestor should be the snapshot most recently added to @seen */ - EBUG_ON(ancestor != seen->pos.snapshot); - EBUG_ON(ancestor != darray_last(seen->ids)); - if (id == ancestor) return true; @@ -722,11 +746,8 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see * numerically, since snapshot ID lists are kept sorted, so if we find * an id that's an ancestor of @id we're done: */ - - for (i = seen->ids.nr - 2; - i >= 0 && seen->ids.data[i] >= id; - --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) + darray_for_each_reverse(seen->ids, i) + if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) return false; return true; @@ -810,7 +831,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, if (!n->whiteout) { return bch2_inode_unpack(inode, &n->inode); } else { - n->inode.bi_inum = inode.k->p.inode; + n->inode.bi_inum = inode.k->p.offset; n->inode.bi_snapshot = inode.k->p.snapshot; return 0; } @@ -890,14 +911,11 @@ lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, str { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)) - goto found; + struct inode_walker_entry *i = darray_find_p(w->inodes, i, + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); - return NULL; -found: - BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); + if (!i) + return NULL; struct printbuf buf = PRINTBUF; int ret = 0; @@ -910,17 +928,15 @@ found: w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct bch_inode_unpacked new = i->inode; - struct bkey_i whiteout; - - new.bi_snapshot = k.k->p.snapshot; - if (!i->whiteout) { + struct bch_inode_unpacked new = i->inode; + new.bi_snapshot = k.k->p.snapshot; ret = __bch2_fsck_write_inode(trans, &new); } else { + struct bkey_i whiteout; bkey_init(&whiteout.k); whiteout.k.type = KEY_TYPE_whiteout; - whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); + whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, &whiteout, BTREE_UPDATE_internal_snapshot_node); @@ -947,7 +963,7 @@ found: if (ret) goto fsck_err; - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto fsck_err; } @@ -992,7 +1008,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans, int ret = 0; if (d->v.d_type == DT_SUBVOL) { - BUG(); + bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); + ret = -BCH_ERR_fsck_repair_unimplemented; } else { ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); if (ret) @@ -1048,7 +1065,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if ((ret || dirent_points_to_inode_nowarn(d, inode)) && + if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && inode->bi_subvol && (inode->bi_flags & BCH_INODE_has_child_snapshot)) { /* Older version of a renamed subvolume root: we won't have a @@ -1069,7 +1086,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), + fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), trans, inode_points_to_wrong_dirent, "%s", (printbuf_reset(&buf), @@ -1141,13 +1158,14 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; - if (u.bi_dir || u.bi_dir_offset) { + if (bch2_inode_has_backpointer(&u)) { ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) goto err; } - if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), + if (fsck_err_on(bch2_inode_has_backpointer(&u) && + (u.bi_flags & BCH_INODE_unlinked), trans, inode_unlinked_but_has_dirent, "inode unlinked but has dirent\n%s", (printbuf_reset(&buf), @@ -1174,6 +1192,14 @@ static int check_inode(struct btree_trans *trans, ret = 0; } + if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, + trans, inode_dir_has_nonzero_i_size, + "directory %llu:%u with nonzero i_size %lli", + u.bi_inum, u.bi_snapshot, u.bi_size)) { + u.bi_size = 0; + do_update = true; + } + ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) goto err; @@ -1436,6 +1462,7 @@ static int check_key_has_inode(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct btree_iter iter2 = {}; int ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -1445,40 +1472,105 @@ static int check_key_has_inode(struct btree_trans *trans, bool have_inode = i && !i->whiteout; - if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { - ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - if (ret) - goto err; + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) + goto reconstruct; - inode->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; - goto err; + if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) + goto out; + + prt_printf(&buf, ", "); + + bool have_old_inode = false; + darray_for_each(inode->inodes, i2) + if (!i2->whiteout && + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && + btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { + prt_printf(&buf, "but found good inode in older snapshot\n"); + bch2_inode_unpacked_to_text(&buf, &i2->inode); + prt_newline(&buf); + have_old_inode = true; + break; + } + + struct bkey_s_c k2; + unsigned nr_keys = 0; + + prt_printf(&buf, "found keys:\n"); + + for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, + SPOS(k.k->p.inode, 0, k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, k2, ret) { + nr_keys++; + if (nr_keys <= 10) { + bch2_bkey_val_to_text(&buf, c, k2); + prt_newline(&buf); + } + if (nr_keys >= 100) + break; } - if (fsck_err_on(!have_inode, - trans, key_in_missing_inode, - "key in missing inode:\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (ret) + goto err; - if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), - trans, key_in_wrong_inode_type, - "key for wrong inode mode %o:\n%s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - goto delete; + if (nr_keys > 100) + prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); + else if (nr_keys > 10) + prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); + + if (!have_inode) { + if (fsck_err_on(!have_inode, + trans, key_in_missing_inode, + "key in missing inode%s", buf.buf)) { + /* + * Maybe a deletion that raced with data move, or something + * weird like that? But if we know the inode was deleted, or + * it's just a few keys, we can safely delete them. + * + * If it's many keys, we should probably recreate the inode + */ + if (have_old_inode || nr_keys <= 2) + goto delete; + else + goto reconstruct; + } + } else { + /* + * not autofix, this one would be a giant wtf - bit error in the + * inode corrupting i_mode? + * + * may want to try repairing inode instead of deleting + */ + if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + trans, key_in_wrong_inode_type, + "key for wrong inode mode %o%s", + i->inode.bi_mode, buf.buf)) + goto delete; + } out: err: fsck_err: + bch2_trans_iter_exit(trans, &iter2); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; delete: + /* + * XXX: print out more info + * count up extents for this inode, check if we have different inode in + * an older snapshot version, perhaps decide if we want to reconstitute + */ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); goto out; +reconstruct: + ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = bch_err_throw(c, transaction_restart_nested); + goto out; } static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) @@ -1569,7 +1661,7 @@ static int extent_ends_at(struct bch_fs *c, sizeof(seen->ids.data[0]) * seen->ids.size, GFP_KERNEL); if (!n.seen.ids.data) - return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { @@ -1619,7 +1711,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1644,7 +1736,7 @@ static int overlapping_extents_found(struct btree_trans *trans, pos2.size != k2.k->size) { bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1692,7 +1784,7 @@ static int overlapping_extents_found(struct btree_trans *trans, * We overwrote the second extent - restart * check_extent() from the top: */ - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } } fsck_err: @@ -1820,20 +1912,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; - if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; + + if (fsck_err_on(k.k->p.offset > last_block && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n%s", i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter iter2; - - bch2_trans_copy_iter(trans, &iter2, iter); - bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); - ret = bch2_btree_iter_traverse(trans, &iter2) ?: - bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter2); + ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: + bch2_fpunch_snapshot(trans, + SPOS(i->inode.bi_inum, + last_block, + i->inode.bi_snapshot), + POS(i->inode.bi_inum, U64_MAX)); if (ret) goto err; @@ -1947,14 +2039,22 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ continue; } - if (fsck_err_on(i->inode.bi_nlink != i->count, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { - i->inode.bi_nlink = i->count; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; + if (i->inode.bi_nlink != i->count) { + CLASS(printbuf, buf)(); + + lockrestart_do(trans, + bch2_inum_snapshot_to_path(trans, w->last_pos.inode, + i->inode.bi_snapshot, NULL, &buf)); + + if (fsck_err_on(i->inode.bi_nlink != i->count, + trans, inode_dir_wrong_nlink, + "directory with wrong i_nlink: got %u, should be %llu\n%s", + i->inode.bi_nlink, i->count, buf.buf)) { + i->inode.bi_nlink = i->count; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } } } fsck_err: @@ -2045,7 +2145,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { if (!new_parent_subvol) { bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); @@ -2107,7 +2207,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -2139,7 +2239,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, struct inode_walker *dir, struct inode_walker *target, - struct snapshots_seen *s) + struct snapshots_seen *s, + bool *need_second_pass) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; @@ -2181,7 +2282,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); + hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; + + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, + iter, k, need_second_pass); if (ret < 0) goto err; if (ret) { @@ -2202,31 +2306,34 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct qstr name = bch2_dirent_get_name(d); - u32 subvol = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_parent_subvol) - : 0; + subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_parent_subvol) + : 0, + }; u64 target = d.v->d_type == DT_SUBVOL ? le32_to_cpu(d.v->d_child_subvol) : le64_to_cpu(d.v->d_inum); - u64 dir_offset; + struct qstr name = bch2_dirent_get_name(d); + + struct bkey_i_dirent *new_d = + bch2_dirent_create_key(trans, hash_info, dir_inum, + d.v->d_type, &name, NULL, target); + ret = PTR_ERR_OR_ZERO(new_d); + if (ret) + goto out; + + new_d->k.p.inode = d.k->p.inode; + new_d->k.p.snapshot = d.k->p.snapshot; - ret = bch2_hash_delete_at(trans, + struct btree_iter dup_iter = {}; + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_dirent_create_snapshot(trans, subvol, - d.k->p.inode, d.k->p.snapshot, - hash_info, - d.v->d_type, - &name, - target, - &dir_offset, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* might need another check_dirents pass */ + bch2_str_hash_repair_key(trans, s, + &bch2_dirent_hash_desc, hash_info, + iter, bkey_i_to_s_c(&new_d->k_i), + &dup_iter, bkey_s_c_null, + need_second_pass); goto out; } @@ -2294,7 +2401,6 @@ out: err: fsck_err: printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -2308,16 +2414,31 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; + bool need_second_pass = false, did_second_pass = false; + int ret; snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, +again: + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, + &need_second_pass)) ?: check_subdir_count_notnested(trans, &dir)); + if (!ret && need_second_pass && !did_second_pass) { + bch_info(c, "check_dirents requires second pass"); + swap(did_second_pass, need_second_pass); + goto again; + } + + if (!ret && need_second_pass) { + bch_err(c, "dirents not repairing"); + ret = -EINVAL; + } + snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -2331,16 +2452,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - int ret; - ret = bch2_check_key_has_snapshot(trans, iter, k); + int ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) return 0; - i = walk_inode(trans, inode, k); + struct inode_walker_entry *i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -2356,9 +2475,9 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); - bch_err_fn(c, ret); - return ret; + bool need_second_pass = false; + return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, + iter, k, &need_second_pass); } /* @@ -2470,6 +2589,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_subvolume) return 0; + subvol_inum start = { + .subvol = k.k->p.offset, + .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), + }; + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { ret = darray_push(&subvol_path, k.k->p.offset); if (ret) @@ -2488,11 +2612,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (darray_u32_has(&subvol_path, parent)) { printbuf_reset(&buf); - prt_printf(&buf, "subvolume loop:\n"); + prt_printf(&buf, "subvolume loop: "); - darray_for_each_reverse(subvol_path, i) - prt_printf(&buf, "%u ", *i); - prt_printf(&buf, "%u", parent); + ret = bch2_inum_to_path(trans, start, &buf); + if (ret) + goto err; if (fsck_err(trans, subvol_loop, "%s", buf.buf)) ret = reattach_subvol(trans, s); @@ -2536,19 +2660,13 @@ int bch2_check_subvolume_structure(struct bch_fs *c) return ret; } -struct pathbuf_entry { - u64 inum; - u32 snapshot; -}; - -typedef DARRAY(struct pathbuf_entry) pathbuf; - -static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, + u64 inum, u32 snapshot, u32 new_depth) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, p->inum, p->snapshot), 0); + SPOS(0, inum, snapshot), 0); struct bch_inode_unpacked inode; int ret = bkey_err(k) ?: @@ -2567,14 +2685,15 @@ err: return ret; } -static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, + u32 snapshot, u32 new_bi_depth) { u32 restart_count = trans->restart_count; int ret = 0; darray_for_each_reverse(*path, i) { ret = nested_lockrestart_do(trans, - bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); bch_err_fn(trans->c, ret); if (ret) break; @@ -2585,37 +2704,36 @@ static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 return ret ?: trans_was_restarted(trans, restart_count); } -static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) -{ - darray_for_each(*p, i) - if (i->inum == inum && - i->snapshot == snapshot) - return true; - return false; -} - static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; - pathbuf path = {}; + darray_u64 path = {}; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; bool redo_bi_depth = false; u32 min_bi_depth = U32_MAX; int ret = 0; + struct bpos start = inode_k.k->p; + struct bch_inode_unpacked inode; ret = bch2_inode_unpack(inode_k, &inode); if (ret) return ret; - while (!inode.bi_subvol) { + /* + * If we're running full fsck, check_dirents() will have already ran, + * and we shouldn't see any missing backpointers here - otherwise that's + * handled separately, by check_unreachable_inodes + */ + while (!inode.bi_subvol && + bch2_inode_has_backpointer(&inode)) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; - u32 parent_snapshot = snapshot; - d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); + d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) goto out; @@ -2633,15 +2751,10 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) bch2_trans_iter_exit(trans, &dirent_iter); - ret = darray_push(&path, ((struct pathbuf_entry) { - .inum = inode.bi_inum, - .snapshot = snapshot, - })); + ret = darray_push(&path, inode.bi_inum); if (ret) return ret; - snapshot = parent_snapshot; - bch2_trans_iter_exit(trans, &inode_iter); inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inode.bi_dir, snapshot), 0); @@ -2663,21 +2776,28 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) break; inode = parent_inode; - snapshot = inode_k.k->p.snapshot; redo_bi_depth = true; - if (path_is_dup(&path, inode.bi_inum, snapshot)) { + if (darray_find(path, inode.bi_inum)) { printbuf_reset(&buf); - prt_printf(&buf, "directory structure loop:\n"); - darray_for_each_reverse(path, i) - prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); - prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); + prt_printf(&buf, "directory structure loop in snapshot %u: ", + snapshot); + + ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); + if (ret) + goto out; + + if (c->opts.verbose) { + prt_newline(&buf); + darray_for_each(path, i) + prt_printf(&buf, "%llu ", *i); + } if (fsck_err(trans, dir_loop, "%s", buf.buf)) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) - break; + goto out; ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); @@ -2691,7 +2811,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) min_bi_depth = 0; if (redo_bi_depth) - ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); + ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -2708,7 +2828,7 @@ fsck_err: int bch2_check_directory_structure(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, @@ -2747,7 +2867,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); - return -BCH_ERR_ENOMEM_fsck_add_nlink; + return bch_err_throw(c, ENOMEM_fsck_add_nlink); } if (t->d) diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 574948278cd4..e5fe7cf7b251 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -4,6 +4,12 @@ #include "str_hash.h" +/* recoverds snapshot IDs of overwrites at @pos */ +struct snapshots_seen { + struct bpos pos; + snapshot_id_list ids; +}; + int bch2_fsck_update_backpointers(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5cf70108ae2f..ef4cc7395b86 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = { #undef x static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); +static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; @@ -1041,7 +1042,7 @@ again: goto found_slot; if (!ret && start == min) - ret = -BCH_ERR_ENOSPC_inode_create; + ret = bch_err_throw(trans->c, ENOSPC_inode_create); if (ret) { bch2_trans_iter_exit(trans, iter); @@ -1130,19 +1131,23 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) u32 snapshot; int ret; + ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); + if (ret) + goto err2; + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should * delete: * - * XXX: the dirent could ideally would delete whiteouts when they're no + * XXX: the dirent code ideally would delete whiteouts when they're no * longer needed */ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); if (ret) - goto err; + goto err2; retry: bch2_trans_begin(trans); @@ -1161,7 +1166,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1260,7 +1265,14 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, { struct bch_fs *c = trans->c; -#ifdef CONFIG_UNICODE +#ifndef CONFIG_UNICODE + bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); + return -EOPNOTSUPP; +#endif + + if (c->opts.casefold_disabled) + return -EOPNOTSUPP; + int ret = 0; /* Not supported on individual files. */ if (!S_ISDIR(bi->bi_mode)) @@ -1284,10 +1296,6 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, bi->bi_fields_set |= BIT(Inode_opt_casefold); return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); -#else - bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); - return -EOPNOTSUPP; -#endif } static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) @@ -1328,7 +1336,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1392,10 +1400,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); } -static int may_delete_deleted_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos, - bool *need_another_pass) +static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + bool from_deleted_inodes) { struct bch_fs *c = trans->c; struct btree_iter inode_iter; @@ -1409,12 +1415,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) return ret; - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), + ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; ret = bch2_inode_unpack(k, &inode); if (ret) @@ -1422,7 +1430,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (S_ISDIR(inode.bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + if (fsck_err_on(from_deleted_inodes && + bch2_err_matches(ret, ENOTEMPTY), trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) @@ -1431,17 +1440,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; + + ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) + ? 0 : bch_err_throw(c, inode_has_child_snapshot); - if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_has_child_snapshots, "inode with child snapshots %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) @@ -1458,19 +1475,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) goto out; } + + if (!from_deleted_inodes) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, inode_has_child_snapshot); + goto out; + } + goto delete; } - if (test_bit(BCH_FS_clean_recovery, &c->flags) && - !fsck_err(trans, deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } + if (from_deleted_inodes) { + if (test_bit(BCH_FS_clean_recovery, &c->flags) && + !fsck_err(trans, deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; + goto out; + } - ret = 1; + ret = 1; + } out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -1481,12 +1507,19 @@ delete: goto out; } +static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); +} + int bch2_delete_dead_inodes(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - bool need_another_pass; int ret; -again: + /* * if we ran check_inodes() unlinked inodes will have already been * cleaned up but the write buffer will be out of sync; therefore we @@ -1496,8 +1529,6 @@ again: if (ret) goto err; - need_another_pass = false; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, @@ -1507,7 +1538,7 @@ again: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + ret = may_delete_deleted_inode(trans, k.k->p, true); if (ret > 0) { bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); @@ -1528,10 +1559,8 @@ again: ret; })); - - if (!ret && need_another_pass) - goto again; err: bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 77ad2d549541..b8ec3e628d90 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -254,6 +254,11 @@ static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_ : c->opts.casefold; } +static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) +{ + return bi->bi_dir || bi->bi_dir_offset; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) @@ -283,15 +288,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) -{ - bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; - - return S_ISDIR(inode->bi_mode) || - inode->bi_subvol || - (!inode->bi_nlink && inode_has_bp); -} - struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index cc07729a4b62..07023667a475 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, opts.data_replicas, BCH_WATERMARK_normal, 0, &cl, &wp); if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); if (ret) goto err; @@ -135,6 +135,33 @@ err_noprint: return ret; } +/* For fsck */ +int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) +{ + u32 restart_count = trans->restart_count; + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_i delete; + + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + start, end, 0, k, + &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bkey_init(&delete.k); + delete.k.p = iter.pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + + bch2_extent_trim_atomic(trans, &iter, &delete) ?: + bch2_trans_update(trans, &iter, &delete, 0); + })); + + bch2_disk_reservation_put(c, &disk_res); + return ret ?: trans_was_restarted(trans, restart_count); +} + /* * Returns -BCH_ERR_transacton_restart if we had to drop locks: */ diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index 9cb44a7c43c1..b93e4d4b3c0c 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -5,6 +5,8 @@ int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, u64, struct bch_io_opts, s64 *, struct write_point_specifier); + +int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cc708d46557e..e0874ad9a6cf 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -56,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) if (!target) return false; - rcu_read_lock(); + guard(rcu)(); devs = bch2_target_to_mask(c, target) ?: &c->rw_devs[BCH_DATA_user]; @@ -73,7 +73,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) total += max(congested, 0LL); nr++; } - rcu_read_unlock(); return get_random_u32_below(nr * CONGESTED_MAX) < total; } @@ -138,21 +137,21 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!opts.promote_target); if (!(flags & BCH_READ_may_promote)) - return -BCH_ERR_nopromote_may_not; + return bch_err_throw(c, nopromote_may_not); if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + return bch_err_throw(c, nopromote_already_promoted); if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + return bch_err_throw(c, nopromote_unwritten); if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + return bch_err_throw(c, nopromote_congested); } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) - return -BCH_ERR_nopromote_in_flight; + return bch_err_throw(c, nopromote_in_flight); return 0; } @@ -167,6 +166,7 @@ static noinline void promote_free(struct bch_read_bio *rbio) BUG_ON(ret); async_object_list_del(c, promote, op->list_idx); + async_object_list_del(c, rbio, rbio->list_idx); bch2_data_update_exit(&op->write); @@ -240,7 +240,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { - ret = -BCH_ERR_nopromote_enomem; + ret = bch_err_throw(c, nopromote_enomem); goto err_put; } @@ -249,7 +249,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { - ret = -BCH_ERR_nopromote_in_flight; + ret = bch_err_throw(c, nopromote_in_flight); goto err; } @@ -344,6 +344,10 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, *bounce = true; *read_full = promote_full; + + if (have_io_error(failed)) + orig->self_healing = true; + return promote; nopromote: trace_io_read_nopromote(c, ret); @@ -453,6 +457,10 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) if (rbio->start_time) bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + if (rbio->list_idx) + async_object_list_del(rbio->c, rbio, rbio->list_idx); +#endif bio_endio(&rbio->bio); } @@ -545,7 +553,7 @@ retry: if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->ret = -BCH_ERR_data_read_key_overwritten; + rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); goto err; } @@ -636,12 +644,15 @@ static void bch2_rbio_retry(struct work_struct *work) prt_str(&buf, "(internal move) "); prt_str(&buf, "data read error, "); - if (!ret) + if (!ret) { prt_str(&buf, "successful retry"); - else + if (rbio->self_healing) + prt_str(&buf, ", self healing"); + } else prt_str(&buf, bch2_err_str(ret)); prt_newline(&buf); + if (!bkey_deleted(&sk.k->k)) { bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); prt_newline(&buf); @@ -1036,7 +1047,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && !orig->data_update) - return -BCH_ERR_extent_poisoned; + return bch_err_throw(c, extent_poisoned); retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -1074,7 +1085,7 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_data_read_no_encryption_key; + ret = bch_err_throw(c, data_read_no_encryption_key); goto err; } @@ -1128,7 +1139,7 @@ retry_pick: if (ca) enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - rbio->ret = -BCH_ERR_data_read_buffer_too_small; + rbio->ret = bch_err_throw(c, data_read_buffer_too_small); goto out_read_done; } @@ -1333,7 +1344,7 @@ hole: * have to signal that: */ if (u) - orig->ret = -BCH_ERR_data_read_key_overwritten; + orig->ret = bch_err_throw(c, data_read_key_overwritten); zero_fill_bio_iter(&orig->bio, iter); out_read_done: @@ -1485,7 +1496,12 @@ void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); prt_printf(out, "context:\t%u\n", rbio->context); - prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); + + int ret = READ_ONCE(rbio->ret); + if (ret < 0) + prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); + else + prt_printf(out, "ret:\t%i\n", ret); prt_printf(out, "flags:\t"); bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); @@ -1510,18 +1526,18 @@ int bch2_fs_io_read_init(struct bch_fs *c) c->opts.btree_node_size, c->opts.encoded_extent_max) / PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; + return bch_err_throw(c, ENOMEM_bio_read_init); if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; + return bch_err_throw(c, ENOMEM_bio_read_split_init); if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; + return bch_err_throw(c, ENOMEM_promote_table_init); return 0; } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index c08b9c047b3e..9c5ddbf861b3 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -44,6 +44,7 @@ struct bch_read_bio { have_ioref:1, narrow_crcs:1, saw_error:1, + self_healing:1, context:2; }; u16 _state; @@ -91,6 +92,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; *data_btree = BTREE_ID_reflink; + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, offset_into_extent, @@ -102,10 +105,10 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, if (bkey_deleted(k.k)) { bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_missing_indirect_extent; + return bch_err_throw(c, missing_indirect_extent); } - bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_bkey_buf_reassemble(extent, c, k); bch2_trans_iter_exit(trans, &iter); return 0; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 52a60982a66b..88b1eec8eff3 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -558,6 +558,7 @@ static void bch2_write_done(struct closure *cl) static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { + struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bkey_i *src, *dst = keys->keys, *n; @@ -569,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -BCH_ERR_data_write_io; + return bch_err_throw(c, data_write_io); } if (dst != src) @@ -976,7 +977,7 @@ csum_err: op->crc.csum_type < BCH_CSUM_NR ? __bch2_csum_types[op->crc.csum_type] : "(unknown)"); - return -BCH_ERR_data_write_csum; + return bch_err_throw(c, data_write_csum); } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -1208,16 +1209,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, e = bkey_s_c_to_extent(k); - rcu_read_lock(); + guard(rcu)(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) { - rcu_read_unlock(); + if (crc_is_encoded(p.crc) || p.has_ec) return false; - } replicas += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); return replicas >= op->opts.data_replicas; } @@ -1290,7 +1288,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -BCH_ERR_data_write_io; + op->error = bch_err_throw(op->c, data_write_io); } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1483,10 +1481,10 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_data_write_invalid_ptr; + ret = bch_err_throw(c, data_write_invalid_ptr); } else { /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + ret = bch_err_throw(c, transaction_restart); } printbuf_exit(&buf); @@ -1693,18 +1691,18 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = -BCH_ERR_data_write_misaligned; + op->error = bch_err_throw(c, data_write_misaligned); goto err; } if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } if (!(op->flags & BCH_WRITE_move) && !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } @@ -1776,7 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c) { if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return -BCH_ERR_ENOMEM_bio_write_init; + return bch_err_throw(c, ENOMEM_bio_write_init); return 0; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 09b70fd140a1..ddfeb0dafc9d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -397,7 +397,7 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if (j->cur_entry_error) return j->cur_entry_error; @@ -407,23 +407,23 @@ static int journal_entry_open(struct journal *j) return ret; if (!fifo_free(&j->pin)) - return -BCH_ERR_journal_pin_full; + return bch_err_throw(c, journal_pin_full); if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return -BCH_ERR_journal_max_in_flight; + return bch_err_throw(c, journal_max_in_flight); if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return -BCH_ERR_journal_max_open; + return bch_err_throw(c, journal_max_open); if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return -BCH_ERR_journal_shutdown; + return bch_err_throw(c, journal_shutdown); } if (!j->free_buf && !buf->data) - return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ BUG_ON(!j->cur_entry_sectors); @@ -447,7 +447,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return -BCH_ERR_journal_full; + return bch_err_throw(c, journal_full); if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -464,7 +464,7 @@ static int journal_entry_open(struct journal *j) journal_cur_seq(j)); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return -BCH_ERR_journal_shutdown; + return bch_err_throw(c, journal_shutdown); } BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); @@ -597,16 +597,16 @@ retry: return ret; if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = -BCH_ERR_journal_max_in_flight; + ret = bch_err_throw(c, journal_max_in_flight); goto out; } @@ -647,7 +647,7 @@ out: goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_stuck; + ret = bch_err_throw(c, journal_stuck); if (ret == -BCH_ERR_journal_max_in_flight && track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && @@ -708,10 +708,9 @@ static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - rcu_read_lock(); + guard(rcu)(); for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); - rcu_read_unlock(); return nsecs_to_jiffies(nsecs); } @@ -813,6 +812,7 @@ out: int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret = 0; @@ -828,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -BCH_ERR_journal_flush_err; + ret = bch_err_throw(c, journal_flush_err); goto out; } @@ -999,7 +999,7 @@ int bch2_journal_meta(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); int ret = __bch2_journal_meta(j); enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); @@ -1082,6 +1082,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou if (open && !*blocked) { __bch2_journal_block(j); + s.v = atomic64_read_acquire(&j->reservations.counter); *blocked = true; } @@ -1132,7 +1133,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err_free; } @@ -1283,7 +1284,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, &disk_res); - closure_sync(&cl); + bch2_wait_on_allocator(c, &cl); } return ret; @@ -1304,6 +1305,66 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, return ret; } +int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) +{ + struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; + struct journal_device *ja = &ca->journal; + + guard(mutex)(&c->sb_lock); + unsigned pos; + for (pos = 0; pos < ja->nr; pos++) + if (ja->buckets[pos] == b) + break; + + if (pos == ja->nr) { + bch_err(ca, "journal bucket %llu not found when deleting", b); + return -EINVAL; + } + + u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);; + if (!new_buckets) + return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memmove(&new_buckets[pos], + &new_buckets[pos + 1], + (ja->nr - 1 - pos) * sizeof(new_buckets[0])); + + int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: + bch2_write_super(c); + if (ret) { + kfree(new_buckets); + return ret; + } + + scoped_guard(spinlock, &j->lock) { + if (pos < ja->discard_idx) + --ja->discard_idx; + if (pos < ja->dirty_idx_ondisk) + --ja->dirty_idx_ondisk; + if (pos < ja->dirty_idx) + --ja->dirty_idx; + if (pos < ja->cur_idx) + --ja->cur_idx; + + ja->nr--; + + memmove(&ja->buckets[pos], + &ja->buckets[pos + 1], + (ja->nr - pos) * sizeof(ja->buckets[0])); + + memmove(&ja->bucket_seq[pos], + &ja->bucket_seq[pos + 1], + (ja->nr - pos) * sizeof(ja->bucket_seq[0])); + + bch2_journal_space_available(j); + } + + kfree(new_buckets); + return 0; +} + int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { struct bch_fs *c = ca->fs; @@ -1313,14 +1374,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); - return -BCH_ERR_erofs_filesystem_full; + return bch_err_throw(c, erofs_filesystem_full); } unsigned nr; int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err; } @@ -1414,14 +1475,13 @@ void bch2_fs_journal_stop(struct journal *j) clear_bit(JOURNAL_running, &j->flags); } -int bch2_fs_journal_start(struct journal *j, u64 cur_seq) +int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_entry_pin_list *p; struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - u64 last_seq = cur_seq, nr, seq; /* * @@ -1435,17 +1495,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) return -EINVAL; } - genradix_for_each_reverse(&c->journal_entries, iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - last_seq = le64_to_cpu(i->j.last_seq); - break; - } + /* Clean filesystem? */ + if (!last_seq) + last_seq = cur_seq; - nr = cur_seq - last_seq; + u64 nr = cur_seq - last_seq; /* * Extra fudge factor, in case we crashed when the journal pin fifo was @@ -1459,7 +1513,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); if (!j->pin.data) { bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; + return bch_err_throw(c, ENOMEM_journal_pin_fifo); } j->replay_journal_seq = last_seq; @@ -1472,6 +1526,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); + u64 seq; fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); @@ -1547,6 +1602,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { + struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_field_get(sb, journal); @@ -1566,7 +1622,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); @@ -1574,7 +1630,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; @@ -1583,7 +1639,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); @@ -1637,10 +1693,12 @@ void bch2_fs_journal_init_early(struct journal *j) int bch2_fs_journal_init(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) - return -BCH_ERR_ENOMEM_journal_buf; + return bch_err_throw(c, ENOMEM_journal_buf); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; @@ -1648,7 +1706,7 @@ int bch2_fs_journal_init(struct journal *j) j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) - return -BCH_ERR_ENOMEM_fs_other_alloc; + return bch_err_throw(c, ENOMEM_fs_other_alloc); return 0; } @@ -1672,7 +1730,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_tabstop_push(out, 28); out->atomic++; - rcu_read_lock(); + guard(rcu)(); s = READ_ONCE(j->reservations); prt_printf(out, "flags:\t"); @@ -1763,8 +1821,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - rcu_read_unlock(); - --out->atomic; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 8ff00a0ec778..977907038d98 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -444,15 +444,16 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, - unsigned nr); +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); +int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); + int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); -int bch2_fs_journal_start(struct journal *, u64); +int bch2_fs_journal_start(struct journal *, u64, u64); void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 63bb207208b2..9e028dbcc3d0 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -49,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) mutex_unlock(&c->sb_lock); } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); + prt_printf(out, "%s %u:%u:%u (sector %llu)", + ca ? ca->name : "(invalid dev)", + p->dev, p->bucket, p->bucket_offset, p->sector); + bch2_dev_put(ca); +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { if (i != j->ptrs.data) prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); + bch2_journal_ptr_to_text(out, c, i); } } -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) { - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - - bch2_journal_ptrs_to_text(out, c, j); - - for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { struct jset_entry_datetime *datetime = container_of(entry, struct jset_entry_datetime, entry); bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); @@ -75,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, } } +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(out, &j->j); + prt_char(out, ' '); + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -149,6 +160,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; + if (last_seq && c->opts.journal_rewind) + last_seq = min(last_seq, c->opts.journal_rewind); + if (!c->journal.oldest_seq_found_ondisk || le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); @@ -188,7 +202,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, le64_to_cpu(j->seq)), GFP_KERNEL); if (!_i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); /* * Duplicate journal entries? If so we want the one that didn't have a @@ -231,7 +245,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, replace: i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; @@ -311,7 +325,7 @@ static void journal_entry_err_msg(struct printbuf *out, bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ if (bch2_fs_inconsistent(c, \ "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = -BCH_ERR_fsck_errors_not_fixed; \ + ret = bch_err_throw(c, fsck_errors_not_fixed); \ goto fsck_err; \ } \ break; \ @@ -418,6 +432,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bool first = true; jset_entry_for_each_key(entry, k) { + /* We may be called on entries that haven't been validated: */ + if (!k->k.u64s) + break; + if (!first) { prt_newline(out); bch2_prt_jset_entry_type(out, entry->type); @@ -1005,19 +1023,19 @@ struct journal_read_buf { size_t size; }; -static int journal_read_buf_realloc(struct journal_read_buf *b, +static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, size_t new_size) { void *n; /* the bios are sized for this many pages, max: */ if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); new_size = roundup_pow_of_two(new_size); n = kvmalloc(new_size, GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); kvfree(b->data); b->data = n; @@ -1037,7 +1055,6 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; - struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -1053,7 +1070,7 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) - return -BCH_ERR_ENOMEM_journal_read_bucket; + return bch_err_throw(c, ENOMEM_journal_read_bucket); bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; @@ -1064,7 +1081,7 @@ reread: kfree(bio); if (!ret && bch2_meta_read_fault("journal")) - ret = -BCH_ERR_EIO_fault_injected; + ret = bch_err_throw(c, EIO_fault_injected); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !ret); @@ -1078,7 +1095,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - goto out; + return 0; } j = buf->data; @@ -1092,15 +1109,15 @@ reread: break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, + ret = journal_read_buf_realloc(c, buf, vstruct_bytes(j)); if (ret) - goto err; + return ret; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - goto out; + return 0; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -1109,7 +1126,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - goto err; + return ret; } if (le64_to_cpu(j->seq) > ja->highest_seq_found) { @@ -1126,22 +1143,20 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - goto out; + return 0; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); if (!csum_good) { - bch_err_dev_ratelimited(ca, "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf)); + /* + * Don't print an error here, we'll print the error + * later if we need this journal entry + */ saw_bad = true; } @@ -1153,6 +1168,7 @@ reread: mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { .csum_good = csum_good, + .csum = csum, .dev = ca->dev_idx, .bucket = bucket, .bucket_offset = offset - @@ -1167,7 +1183,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - goto err; + return ret; } next_block: pr_debug("next"); @@ -1176,11 +1192,7 @@ next_block: j = ((void *) j) + (sectors << 9); } -out: - ret = 0; -err: - printbuf_exit(&err); - return ret; + return 0; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1197,7 +1209,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!ja->nr) goto out; - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); if (ret) goto err; @@ -1229,13 +1241,105 @@ err: goto out; } +noinline_for_stack +static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) +{ + struct printbuf buf = PRINTBUF; + enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); + bool have_good = false; + + prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(&buf, &j->j); + prt_newline(&buf); + + darray_for_each(j->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_ptr_to_text(&buf, c, ptr); + prt_char(&buf, ' '); + bch2_csum_to_text(&buf, csum_type, ptr->csum); + prt_newline(&buf); + } else { + have_good = true; + } + + prt_printf(&buf, "should be "); + bch2_csum_to_text(&buf, csum_type, j->j.csum); + + if (have_good) + prt_printf(&buf, "\n(had good copy on another device)"); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + struct genradix_iter radix_iter; + struct journal_replay *i, **_i, *prev = NULL; + u64 seq = start_seq; + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (journal_replay_ignore(i)) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + u64 missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + u64 missing_end = seq - 1; + + printbuf_reset(&buf); + prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + missing_start, missing_end, + start_seq, end_seq); + + prt_printf(&buf, "\nprev at "); + if (prev) { + bch2_journal_ptrs_to_text(&buf, c, prev); + prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } else + prt_printf(&buf, "(none)"); + + prt_printf(&buf, "\nnext at "); + bch2_journal_ptrs_to_text(&buf, c, i); + prt_printf(&buf, ", continue?"); + + fsck_err(c, journal_entries_missing, "%s", buf.buf); + } + + prev = i; + seq++; + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, **_i, *prev = NULL; + struct journal_replay *i, **_i; struct genradix_iter radix_iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; @@ -1326,14 +1430,24 @@ int bch2_journal_read(struct bch_fs *c, return 0; } - bch_info(c, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); + printbuf_reset(&buf); + prt_printf(&buf, "journal read done, replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); - if (*start_seq != *blacklist_seq) - bch_info(c, "dropped unflushed entries %llu-%llu", - *blacklist_seq, *start_seq - 1); + /* + * Drop blacklisted entries and entries older than last_seq (or start of + * journal rewind: + */ + u64 drop_before = *last_seq; + if (c->opts.journal_rewind) { + drop_before = min(drop_before, c->opts.journal_rewind); + prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); + } - /* Drop blacklisted entries and entries older than last_seq: */ + *last_seq = drop_before; + if (*start_seq != *blacklist_seq) + prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); + bch_info(c, "%s", buf.buf); genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1341,7 +1455,7 @@ int bch2_journal_read(struct bch_fs *c, continue; seq = le64_to_cpu(i->j.seq); - if (seq < *last_seq) { + if (seq < drop_before) { journal_replay_free(c, i, false); continue; } @@ -1354,56 +1468,9 @@ int bch2_journal_read(struct bch_fs *c, } } - /* Check for missing entries: */ - seq = *last_seq; - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - u64 missing_start, missing_end; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (prev) { - bch2_journal_ptrs_to_text(&buf1, c, prev); - prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf1, "(none)"); - bch2_journal_ptrs_to_text(&buf2, c, i); - - missing_end = seq - 1; - fsck_err(c, journal_entries_missing, - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" - "prev at %s\n" - "next at %s, continue?", - missing_start, missing_end, - *last_seq, *blacklist_seq - 1, - buf1.buf, buf2.buf); - - printbuf_exit(&buf1); - printbuf_exit(&buf2); - } - - prev = i; - seq++; - } + ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); + if (ret) + goto err; genradix_for_each(&c->journal_entries, radix_iter, _i) { union bch_replicas_padded replicas = { @@ -1416,15 +1483,15 @@ int bch2_journal_read(struct bch_fs *c, if (journal_replay_ignore(i)) continue; - darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - if (!ptr->csum_good) - bch_err_dev_offset(ca, ptr->sector, - "invalid journal checksum, seq %llu%s", - le64_to_cpu(i->j.seq), - i->csum_good ? " (had good copy on another device)" : ""); - } + /* + * Don't print checksum errors until we know we're going to use + * a given journal entry: + */ + darray_for_each(i->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_print_checksum_error(c, i); + break; + } ret = jset_validate(c, bch2_dev_have_ref(c, i->ptrs.data[0].dev), @@ -1467,7 +1534,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); - rcu_read_lock(); + guard(rcu)(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1489,7 +1556,6 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); } } - rcu_read_unlock(); } static void __journal_write_alloc(struct journal *j, @@ -1559,7 +1625,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); retry_alloc: __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); @@ -1581,6 +1647,16 @@ retry_alloc: done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); +#if 0 + /* + * XXX: we need a way to alert the user when we go degraded for any + * reason + */ + if (*replicas < min(replicas_want, + dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { + } +#endif + return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } @@ -1628,7 +1704,7 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - err = -BCH_ERR_journal_write_err; + err = bch_err_throw(c, journal_write_err); } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); @@ -1640,9 +1716,10 @@ static CLOSURE_CALLBACK(journal_write_done) bch2_log_msg_start(c, &buf); if (err == -BCH_ERR_journal_write_err) - prt_printf(&buf, "unable to write journal to sufficient devices"); + prt_printf(&buf, "unable to write journal to sufficient devices\n"); else - prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); + prt_printf(&buf, "journal write error marking replicas: %s\n", + bch2_err_str(err)); bch2_fs_emergency_read_only2(c, &buf); @@ -1690,6 +1767,7 @@ static CLOSURE_CALLBACK(journal_write_done) closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); + do_discards = true; } j->seq_ondisk = seq; @@ -2058,7 +2136,7 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); union bch_replicas_padded replicas; - unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 12b39fcb4424..6fa82c4050fe 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *); struct journal_ptr { bool csum_good; + struct bch_csum csum; u8 dev; u32 bucket; u32 bucket_offset; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 70f36f6bc482..0042d43b8e57 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -83,18 +83,20 @@ static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_device *ja = &ca->journal; unsigned sectors, buckets, unwritten; + unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); u64 seq; if (from == journal_space_total) return (struct journal_space) { - .next_entry = ca->mi.bucket_size, - .total = ca->mi.bucket_size * ja->nr, + .next_entry = bucket_size_aligned, + .total = bucket_size_aligned * ja->nr, }; buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = ja->sectors_free; + sectors = round_down(ja->sectors_free, block_sectors(c)); /* * We that we don't allocate the space for a journal entry @@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, continue; /* entry won't fit on this device, skip: */ - if (unwritten > ca->mi.bucket_size) + if (unwritten > bucket_size_aligned) continue; if (unwritten >= sectors) { @@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, } buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } sectors -= unwritten; @@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, if (sectors < ca->mi.bucket_size && buckets) { buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } return (struct journal_space) { .next_entry = sectors, - .total = sectors + buckets * ca->mi.bucket_size, + .total = sectors + buckets * bucket_size_aligned, }; } @@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr || !ca->mi.durability) @@ -164,12 +165,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne array_insert_item(dev_space, nr_devs, pos, space); } - rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; /* + * It's possible for bucket size to be misaligned w.r.t. the filesystem + * block size: + */ + min_bucket_size = round_down(min_bucket_size, block_sectors(c)); + + /* * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ @@ -189,8 +195,8 @@ void bch2_journal_space_available(struct journal *j) int ret = 0; lockdep_assert_held(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -210,7 +216,6 @@ void bch2_journal_space_available(struct journal *j) max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; } - rcu_read_unlock(); j->can_discard = can_discard; @@ -221,15 +226,13 @@ void bch2_journal_space_available(struct journal *j) prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" "rw journal devs:", nr_online, metadata_replicas_required(c)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = -BCH_ERR_insufficient_journal_devices; + ret = bch_err_throw(c, insufficient_journal_devices); goto out; } @@ -243,7 +246,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -256,8 +259,7 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret - ? round_down(j->space[journal_space_discarded].next_entry, - block_sectors(c)) + ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -625,9 +627,9 @@ static u64 journal_seq_to_flush(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq_to_flush = 0; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -642,15 +644,11 @@ static u64 journal_seq_to_flush(struct journal *j) seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } - rcu_read_unlock(); /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - return seq_to_flush; + return max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); } /** diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 62b910f2fb27..0cb9b93f13e7 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) - return -BCH_ERR_ENOSPC_sb_journal; + return bch_err_throw(c, ENOSPC_sb_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index c5a7d800a0f5..af4fe416d9ec 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, sb_blacklist_u64s(nr + 1)); if (!bl) { - ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; + ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); goto out; } @@ -152,7 +152,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) - return -BCH_ERR_ENOMEM_blacklist_table_init; + return bch_err_throw(c, ENOMEM_blacklist_table_init); t->nr = nr; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 2f63fc6d456f..57b5b3263b08 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -145,13 +145,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, case BCH_LRU_fragmentation: { a = bch2_alloc_to_v4(k, &a_convert); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - u64 idx = ca + return ca ? alloc_lru_idx_fragmentation(*a, ca) : 0; - rcu_read_unlock(); - return idx; } case BCH_LRU_stripes: return k.k->type == KEY_TYPE_stripe diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index bb7a92270c09..f296cce95338 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -35,7 +35,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); return 0; } @@ -156,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -BCH_ERR_remove_with_metadata_missing_unimplemented; + return bch_err_throw(c, remove_with_metadata_missing_unimplemented); trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 79f4722621d5..eec591e947bd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,30 +38,74 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *, void *, + enum btree_id, struct bkey_s_c, + struct bch_io_opts *, + struct data_update_opts *); + +static noinline void +trace_io_move2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { - if (trace_io_move_enabled()) { - struct printbuf buf = PRINTBUF; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); - } + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move(c, buf.buf); + printbuf_exit(&buf); } -static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_io_move_read_enabled()) { - struct printbuf buf = PRINTBUF; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); + bch2_bkey_val_to_text(&buf, c, k); + trace_io_move_read(c, buf.buf); + printbuf_exit(&buf); +} + +static noinline void +trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts, + move_pred_fn pred, void *_arg, bool p) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "%ps: %u", pred, p); + + if (pred == evacuate_bucket_pred) { + struct evacuate_bucket_arg *arg = _arg; + prt_printf(&buf, " gen=%u", arg->gen); } + + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move_pred(c, buf.buf); + printbuf_exit(&buf); +} + +static noinline void +trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, " gen: %i\n", gen); + + trace_io_move_evacuate_bucket(c, buf.buf); + printbuf_exit(&buf); } struct moving_io { @@ -298,7 +342,8 @@ int bch2_move_extent(struct moving_context *ctxt, struct bch_fs *c = trans->c; int ret = -ENOMEM; - trace_io_move2(c, k, &io_opts, &data_opts); + if (trace_io_move_enabled()) + trace_io_move2(c, k, &io_opts, &data_opts); this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) @@ -314,16 +359,14 @@ int bch2_move_extent(struct moving_context *ctxt, return 0; } - /* - * Before memory allocations & taking nocow locks in - * bch2_data_update_init(): - */ - bch2_trans_unlock(trans); - - struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); + struct moving_io *io = allocate_dropping_locks(trans, ret, + kzalloc(sizeof(struct moving_io), _gfp)); if (!io) goto err; + if (ret) + goto err_free; + INIT_LIST_HEAD(&io->io_list); io->write.ctxt = ctxt; io->read_sectors = k.k->size; @@ -343,6 +386,8 @@ int bch2_move_extent(struct moving_context *ctxt, io->write.op.c = c; io->write.data_opts = data_opts; + bch2_trans_unlock(trans); + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); if (ret) goto err_free; @@ -364,7 +409,8 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - trace_io_move_read2(c, k); + if (trace_io_move_read_enabled()) + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -390,9 +436,6 @@ int bch2_move_extent(struct moving_context *ctxt, err_free: kfree(io); err: - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -408,6 +451,9 @@ err: trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } + + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; return ret; } @@ -496,6 +542,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, bch2_inode_opts_get(io_opts, c, &inode); } bch2_trans_iter_exit(trans, &inode_iter); + /* seem to be spinning here? */ out: return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } @@ -910,7 +957,13 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, } struct data_update_opts data_opts = {}; - if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) { + bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); + + if (trace_io_move_pred_enabled()) + trace_io_move_pred2(c, k, &io_opts, &data_opts, + pred, arg, p); + + if (!p) { bch2_trans_iter_exit(trans, &iter); goto next; } @@ -918,7 +971,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (data_opts.scrub && !bch2_dev_idx_is_online(c, data_opts.read_dev)) { bch2_trans_iter_exit(trans, &iter); - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); break; } @@ -993,12 +1046,6 @@ int bch2_move_data_phys(struct bch_fs *c, return ret; } -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, @@ -1025,8 +1072,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bpos bucket, int gen, struct data_update_opts data_opts) { + struct bch_fs *c = ctxt->trans->c; struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + count_event(c, io_move_evacuate_bucket); + if (trace_io_move_evacuate_bucket_enabled()) + trace_io_move_evacuate_bucket2(c, bucket, gen); + return __bch2_move_data_phys(ctxt, bucket_in_flight, bucket.inode, bucket.offset, @@ -1124,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - rcu_read_lock(); + guard(rcu)(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -1134,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, data_opts->kill_ptrs |= BIT(i); i++; } - rcu_read_unlock(); if (!data_opts->kill_ptrs && (!nr_good || nr_good >= replicas)) @@ -1242,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1253,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } - rcu_read_unlock(); return data_opts->kill_ptrs != 0; } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e7a2a13554d7..5e6de91a8763 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -28,7 +28,7 @@ #include <linux/wait.h> struct buckets_in_flight { - struct rhashtable table; + struct rhashtable *table; struct move_bucket *first; struct move_bucket *last; size_t nr; @@ -71,7 +71,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (ret) return ret; - struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p); if (!ca) goto out; @@ -98,7 +98,7 @@ out: static void move_bucket_free(struct buckets_in_flight *list, struct move_bucket *b) { - int ret = rhashtable_remove_fast(&list->table, &b->hash, + int ret = rhashtable_remove_fast(list->table, &b->hash, bch_move_bucket_params); BUG_ON(ret); kfree(b); @@ -133,7 +133,7 @@ static void move_buckets_wait(struct moving_context *ctxt, static bool bucket_in_flight(struct buckets_in_flight *list, struct move_bucket_key k) { - return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); + return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params); } static int bch2_copygc_get_buckets(struct moving_context *ctxt, @@ -185,7 +185,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, goto err; } - ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, + ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, bch_move_bucket_params); BUG_ON(ret2); @@ -293,11 +293,9 @@ u64 bch2_copygc_wait_amount(struct bch_fs *c) { u64 wait = U64_MAX; - rcu_read_lock(); + guard(rcu)(); for_each_rw_member_rcu(c, ca) wait = min(wait, bch2_copygc_dev_wait_amount(ca)); - rcu_read_unlock(); - return wait; } @@ -321,21 +319,21 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) bch2_printbuf_make_room(out, 4096); - rcu_read_lock(); + struct task_struct *t; out->atomic++; + scoped_guard(rcu) { + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } - prt_printf(out, "Currently calculated wait:\n"); - for_each_rw_member_rcu(c, ca) { - prt_printf(out, " %s:\t", ca->name); - prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); - prt_newline(out); + t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); } - - struct task_struct *t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); --out->atomic; - rcu_read_unlock(); if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -352,10 +350,13 @@ static int bch2_copygc_thread(void *arg) struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); + buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); + int ret = !buckets.table + ? -ENOMEM + : rhashtable_init(buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); if (ret) - return ret; + goto err; set_freezable(); @@ -423,11 +424,12 @@ static int bch2_copygc_thread(void *arg) } move_buckets_wait(&ctxt, &buckets, true); - rhashtable_destroy(&buckets.table); + rhashtable_destroy(buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); - - return 0; +err: + kfree(buckets.table); + return ret; } void bch2_copygc_stop(struct bch_fs *c) diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index b9683d22bab0..f615910d6f98 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -7,11 +7,10 @@ void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); static inline void bch2_copygc_wakeup(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); struct task_struct *p = rcu_dereference(c->copygc_thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_copygc_stop(struct bch_fs *); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index a84b69d6caef..c3f87c59922d 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -175,6 +175,16 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode)) { + ret = bch2_maybe_propagate_has_case_insensitive(trans, + (subvol_inum) { + new_inode->bi_subvol ?: dir.subvol, + new_inode->bi_inum }, + new_inode); + if (ret) + goto err; + } + if (S_ISDIR(mode) && !new_inode->bi_subvol) new_inode->bi_depth = dir_u->bi_depth + 1; @@ -287,7 +297,7 @@ int bch2_unlink_trans(struct btree_trans *trans, } if (deleting_subvol && !inode_u->bi_subvol) { - ret = -BCH_ERR_ENOENT_not_subvol; + ret = bch_err_throw(c, ENOENT_not_subvol); goto err; } @@ -425,8 +435,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, &src_dir_u->bi_size, - dst_dir, &dst_hash, &dst_dir_u->bi_size, + src_dir, &src_hash, + dst_dir, &dst_hash, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -615,14 +625,26 @@ static int __bch2_inum_to_path(struct btree_trans *trans, { unsigned orig_pos = path->pos; int ret = 0; + DARRAY(subvol_inum) inums = {}; + + if (!snapshot) { + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); + if (ret) + goto disconnected; + } while (true) { - if (!snapshot) { - ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); - if (ret) - goto disconnected; + subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; + + if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { + prt_str_reversed(path, "(loop)"); + break; } + ret = darray_push(&inums, n); + if (ret) + goto err; + struct bch_inode_unpacked inode; ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); if (ret) @@ -633,14 +655,16 @@ static int __bch2_inum_to_path(struct btree_trans *trans, break; if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = -BCH_ERR_ENOENT_inode_no_backpointer; + ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); goto disconnected; } inum = inode.bi_dir; if (inode.bi_parent_subvol) { subvol = inode.bi_parent_subvol; - snapshot = 0; + ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); + if (ret) + goto disconnected; } struct btree_iter d_iter; @@ -652,6 +676,7 @@ static int __bch2_inum_to_path(struct btree_trans *trans, goto disconnected; struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); prt_char(path, '/'); @@ -667,8 +692,10 @@ out: goto err; reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + darray_exit(&inums); return 0; err: + darray_exit(&inums); return ret; disconnected: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -707,8 +734,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, if (inode_points_to_dirent(target, d)) return 0; - if (!target->bi_dir && - !target->bi_dir_offset) { + if (!bch2_inode_has_backpointer(target)) { fsck_err_on(S_ISDIR(target->bi_mode), trans, inode_dir_missing_backpointer, "directory with missing backpointer\n%s", @@ -733,15 +759,6 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - struct bkey_s_c_dirent bp_dirent = bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), @@ -768,6 +785,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, ret = __bch2_fsck_write_inode(trans, target); } } else { + printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, d.s_c); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); @@ -857,7 +875,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, + BTREE_UPDATE_internal_snapshot_node); if (ret) goto err; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 2a02606254b3..63f8e254495c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -234,6 +234,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_CASEFOLD, false, \ NULL, "Dirent lookups are casefolded") \ + x(casefold_disabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Disable casefolding filesystem wide") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -379,6 +384,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Exit recovery immediately prior to journal replay")\ + x(journal_rewind, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(0, U64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Rewind journal") \ x(recovery_passes, u64, \ OPT_FS|OPT_MOUNT, \ OPT_BITFIELD(bch2_recovery_passes), \ diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 1ca476adbf6f..8f4e28d440ac 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], .size = _size, \ }) +static inline struct printbuf bch2_printbuf_init(void) +{ + return PRINTBUF; +} + +DEFINE_CLASS(printbuf, struct printbuf, + bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) + /* * Returns size remaining of output buffer: */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 3d4755d73af7..f241efb1fb50 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c) struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { mutex_unlock(&c->sb_lock); - return -BCH_ERR_ENOSPC_sb_quota; + return bch_err_throw(c, ENOSPC_sb_quota); } bch2_sb_quota_read(c); @@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; + ret = bch_err_throw(c, ENOSPC_sb_quota); goto unlock; } @@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; + ret = bch_err_throw(c, ENOSPC_sb_quota); goto unlock; } diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index bef2aa1b8bcd..b1438be9d690 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p) while (nr--) kfree(*p); } - -#define local_irq_save(flags) \ -do { \ - flags = 0; \ -} while (0) #endif static noinline void __process_finished_items(struct rcu_pending *pending, @@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + /* We could technically be scheduled before taking the lock and end up + * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock + * anyways + * + * And we have to do it this way to avoid breaking PREEMPT_RT, which + * redefines how spinlocks work: + */ + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && @@ -520,9 +521,8 @@ check_expired: goto free_node; } - local_irq_save(flags); - p = this_cpu_ptr(pending->p); - spin_lock(&p->lock); + p = raw_cpu_ptr(pending->p); + spin_lock_irqsave(&p->lock, flags); goto restart; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index de1ec9e0caa0..1c345b86b1c0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -80,13 +80,12 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } - rcu_read_unlock(); return rewrite_ptrs; } @@ -135,12 +134,11 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } incompressible: if (opts->background_target) { - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; - rcu_read_unlock(); } return sectors; @@ -445,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, if (bch2_err_matches(ret, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -527,7 +525,7 @@ static void rebalance_wait(struct bch_fs *c) r->state = BCH_REBALANCE_waiting; } - bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); + bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); } static bool bch2_rebalance_enabled(struct bch_fs *c) @@ -544,6 +542,7 @@ static int do_rebalance(struct moving_context *ctxt) struct bch_fs_rebalance *r = &c->rebalance; struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; + u32 kick = r->kick; int ret = 0; bch2_trans_begin(trans); @@ -593,7 +592,8 @@ static int do_rebalance(struct moving_context *ctxt) if (!ret && !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen)) { + !atomic64_read(&r->scan_stats.sectors_seen) && + kick == r->kick) { bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); @@ -677,11 +677,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) } prt_newline(out); - rcu_read_lock(); - struct task_struct *t = rcu_dereference(c->rebalance.thread); - if (t) - get_task_struct(t); - rcu_read_unlock(); + struct task_struct *t; + scoped_guard(rcu) { + t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + } if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -794,7 +795,7 @@ static int check_rebalance_work_one(struct btree_trans *trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots); - return -BCH_ERR_transaction_restart_nested; + return bch_err_throw(c, transaction_restart_nested); } if (!extent_k.k && !rebalance_k.k) diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 5d9214fe1a22..7a565ea7dbfc 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -39,13 +39,11 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(c->rebalance.thread); + c->rebalance.kick++; + guard(rcu)(); + struct task_struct *p = rcu_dereference(c->rebalance.thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 33d77286f1d5..c659da149fa3 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -18,6 +18,7 @@ enum bch_rebalance_states { struct bch_fs_rebalance { struct task_struct __rcu *thread; + u32 kick; struct bch_pd_controller pd; enum bch_rebalance_states state; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4fca57575565..c94debb12d2f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -99,9 +99,11 @@ int bch2_btree_lost_data(struct bch_fs *c, goto out; case BTREE_ID_snapshots: ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; default: + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; } @@ -272,6 +274,28 @@ static int bch2_journal_replay_key(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, &iter); if (unlikely(!btree_path_node(path, k->level))) { + struct bch_fs *c = trans->c; + + CLASS(printbuf, buf)(); + prt_str(&buf, "btree="); + bch2_btree_id_to_text(&buf, k->btree_id); + prt_printf(&buf, " level=%u ", k->level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); + + if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| + BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { + bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", + buf.buf); + ret = -EINVAL; + } + + if (!k->allocated) { + bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", + buf.buf); + k->overwritten = true; + goto out; + } + bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); @@ -594,6 +618,7 @@ static int read_btree_roots(struct bch_fs *c) buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; + ret = 0; } } @@ -679,7 +704,7 @@ static bool check_version_upgrade(struct bch_fs *c) ret = true; } - if (new_version > c->sb.version_incompat && + if (new_version > c->sb.version_incompat_allowed && c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { struct printbuf buf = PRINTBUF; @@ -739,7 +764,24 @@ int bch2_fs_recovery(struct bch_fs *c) ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) : BCH_RECOVERY_PASS_snapshots_read; c->opts.nochanges = true; + } + + if (c->opts.nochanges) c->opts.read_only = true; + + if (c->opts.journal_rewind) { + bch_info(c, "rewinding journal, fsck required"); + c->opts.fsck = true; + } + + if (go_rw_in_recovery(c)) { + /* + * start workqueues/kworkers early - kthread creation checks for + * pending signals, which is _very_ annoying + */ + ret = bch2_fs_init_rw(c); + if (ret) + goto err; } mutex_lock(&c->sb_lock); @@ -879,7 +921,7 @@ int bch2_fs_recovery(struct bch_fs *c) use_clean: if (!clean) { bch_err(c, "no superblock clean section found"); - ret = -BCH_ERR_fsck_repair_impossible; + ret = bch_err_throw(c, fsck_repair_impossible); goto err; } @@ -950,7 +992,7 @@ use_clean: ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", journal_seq, last_seq, blacklist_seq - 1) ?: - bch2_fs_journal_start(&c->journal, journal_seq); + bch2_fs_journal_start(&c->journal, last_seq, journal_seq); if (ret) goto err; @@ -1093,13 +1135,6 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.retain_recovery_info) { - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - } - if (!IS_ERR(clean)) - kfree(clean); - if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && !c->opts.nochanges) { @@ -1108,6 +1143,9 @@ out: } bch_err_fn(c, ret); +final_out: + if (!IS_ERR(clean)) + kfree(clean); return ret; err: fsck_err: @@ -1115,13 +1153,13 @@ fsck_err: struct printbuf buf = PRINTBUF; bch2_log_msg_start(c, &buf); - prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); + prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); bch2_fs_emergency_read_only2(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } - return ret; + goto final_out; } int bch2_fs_initialize(struct bch_fs *c) @@ -1170,7 +1208,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - ret = bch2_fs_journal_start(&c->journal, 1); + ret = bch2_fs_journal_start(&c->journal, 1, 1); if (ret) goto err; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index dabb29b08ad0..6a039e011064 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -103,20 +103,20 @@ static void bch2_sb_recovery_passes_to_text(struct printbuf *out, prt_tab(out); bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + prt_str(out, " (no ratelimit)"); + prt_newline(out); } } -static void bch2_sb_recovery_pass_complete(struct bch_fs *c, - enum bch_recovery_pass pass, - s64 start_time) +static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, + enum bch_recovery_pass pass) { enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - s64 end_time = ktime_get_real_seconds(); - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __clear_bit_le64(stable, ext->recovery_passes_required); + lockdep_assert_held(&c->sb_lock); struct bch_sb_field_recovery_passes *r = bch2_sb_field_get(c->disk_sb.sb, recovery_passes); @@ -127,15 +127,43 @@ static void bch2_sb_recovery_pass_complete(struct bch_fs *c, r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); if (!r) { bch_err(c, "error creating recovery_passes sb section"); - goto out; + return NULL; } } - r->start[stable].last_run = cpu_to_le64(end_time); - r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time)); -out: + return r->start + stable; +} + +static void bch2_sb_recovery_pass_complete(struct bch_fs *c, + enum bch_recovery_pass pass, + s64 start_time) +{ + guard(mutex)(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __clear_bit_le64(bch2_recovery_pass_to_stable(pass), + ext->recovery_passes_required); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e) { + s64 end_time = ktime_get_real_seconds(); + e->last_run = cpu_to_le64(end_time); + e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + } + bch2_write_super(c); - mutex_unlock(&c->sb_lock); +} + +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + guard(mutex)(&c->sb_lock); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + bch2_write_super(c); + } } static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) @@ -157,6 +185,9 @@ static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recover */ ret = (u64) le32_to_cpu(i->last_runtime) * 100 > ktime_get_real_seconds() - le64_to_cpu(i->last_run); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + ret = false; } return ret; @@ -186,11 +217,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || - !c->opts.read_only || - !c->sb.clean || - c->opts.recovery_passes || - (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { + if (go_rw_in_recovery(c)) { if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); bch2_reconstruct_alloc(c); @@ -263,8 +290,13 @@ static bool recovery_pass_needs_set(struct bch_fs *c, enum bch_run_recovery_pass_flags *flags) { struct bch_fs_recovery *r = &c->recovery; - bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + + /* + * Never run scan_for_btree_nodes persistently: check_topology will run + * it if required + */ + if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + *flags |= RUN_RECOVERY_PASS_nopersistent; if ((*flags & RUN_RECOVERY_PASS_ratelimit) && !bch2_recovery_pass_want_ratelimit(c, pass)) @@ -279,6 +311,11 @@ static bool recovery_pass_needs_set(struct bch_fs *c, * Otherwise, we run run_explicit_recovery_pass when we find damage, so * it should run again even if it's already run: */ + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); if (persistent ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) @@ -289,6 +326,9 @@ static bool recovery_pass_needs_set(struct bch_fs *c, (r->passes_ratelimiting & BIT_ULL(pass))) return true; + if (rewind) + return true; + return false; } @@ -315,10 +355,12 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, goto out; bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool rewind = in_recovery && r->curr_pass > pass; + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { + if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); } @@ -327,7 +369,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { prt_printf(out, "need recovery pass %s (%u), but already rw\n", bch2_recovery_passes[pass], pass); - ret = -BCH_ERR_cannot_rewind_recovery; + ret = bch_err_throw(c, cannot_rewind_recovery); goto out; } @@ -347,7 +389,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, if (rewind) { r->next_pass = pass; r->passes_complete &= (1ULL << pass) >> 1; - ret = -BCH_ERR_restart_recovery; + ret = bch_err_throw(c, restart_recovery); } } else { prt_printf(out, "scheduling recovery pass %s (%u)%s\n", @@ -371,10 +413,8 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, { int ret = 0; - scoped_guard(mutex, &c->sb_lock) { - if (!recovery_pass_needs_set(c, pass, &flags)) - return 0; - + if (recovery_pass_needs_set(c, pass, &flags)) { + guard(mutex)(&c->sb_lock); ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); bch2_write_super(c); } @@ -382,9 +422,38 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, return ret; } +/* + * Returns 0 if @pass has run recently, otherwise one of + * -BCH_ERR_restart_recovery + * -BCH_ERR_recovery_pass_will_run + */ +int bch2_require_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) +{ + if (test_bit(BCH_FS_in_recovery, &c->flags) && + c->recovery.passes_complete & BIT_ULL(pass)) + return 0; + + guard(mutex)(&c->sb_lock); + + if (bch2_recovery_pass_want_ratelimit(c, pass)) + return 0; + + enum bch_run_recovery_pass_flags flags = 0; + int ret = 0; + + if (recovery_pass_needs_set(c, pass, &flags)) { + ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + bch2_write_super(c); + } + + return ret ?: bch_err_throw(c, recovery_pass_will_run); +} + int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; + enum bch_run_recovery_pass_flags flags = 0; if (!recovery_pass_needs_set(c, pass, &flags)) return 0; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index dc0d2014ff9b..2117f0ce1922 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -10,11 +10,22 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); + enum bch_run_recovery_pass_flags { RUN_RECOVERY_PASS_nopersistent = BIT(0), RUN_RECOVERY_PASS_ratelimit = BIT(1), }; +static inline bool go_rw_in_recovery(struct bch_fs *c) +{ + return (c->journal_keys.nr || + !c->opts.read_only || + !c->sb.clean || + c->opts.recovery_passes || + (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); +} + int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, @@ -24,6 +35,9 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, enum bch_recovery_pass, enum bch_run_recovery_pass_flags); +int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass); + int bch2_run_online_recovery_passes(struct bch_fs *, u64); int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h index c434eafbca19..b63c20558d3d 100644 --- a/fs/bcachefs/recovery_passes_format.h +++ b/fs/bcachefs/recovery_passes_format.h @@ -87,6 +87,8 @@ struct recovery_pass_entry { __le32 flags; }; +LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) + struct bch_sb_field_recovery_passes { struct bch_sb_field field; struct recovery_pass_entry start[]; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3a13dbcab6ba..92b90cfe622b 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -64,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); + + if (REFLINK_P_ERROR(p.v)) + prt_str(out, " error"); } bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) @@ -269,13 +272,12 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - unsigned size = min((u64) k.k->size, - REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - - reflink_offset); - bch2_key_resize(&iter->k, size); + u64 missing_end = min(k.k->p.offset, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); + BUG_ON(reflink_offset == missing_end); int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, - k.k->p.offset, should_commit); + missing_end, should_commit); if (ret) { bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); @@ -312,7 +314,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_missing_indirect_extent; + ret = bch_err_throw(c, missing_indirect_extent); goto next; } @@ -612,7 +614,7 @@ s64 bch2_remap_range(struct bch_fs *c, int ret = 0, ret2 = 0; if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -711,7 +713,8 @@ s64 bch2_remap_range(struct bch_fs *c, SET_REFLINK_P_IDX(&dst_p->v, offset); if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts) + may_change_src_io_path_opts && + REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); @@ -847,7 +850,7 @@ int bch2_gc_reflink_start(struct bch_fs *c) struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, GFP_KERNEL); if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; + ret = bch_err_throw(c, ENOMEM_gc_reflink_start); break; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 477ef0997949..8383bd7fdb3f 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, return 0; bad: bch2_replicas_entry_to_text(err, r); - return -BCH_ERR_invalid_replicas_entry; + return bch_err_throw(c, invalid_replicas_entry); } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, !__replicas_has_entry(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); if (!new_gc.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; + ret = bch_err_throw(c, ENOMEM_cpu_replicas); goto err; } } @@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (!__replicas_has_entry(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); if (!new_r.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; + ret = bch_err_throw(c, ENOMEM_cpu_replicas); goto err; } @@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); - return -BCH_ERR_ENOMEM_replicas_gc; + return bch_err_throw(c, ENOMEM_replicas_gc); } for_each_cpu_replicas_entry(&c->replicas, e) @@ -458,7 +458,7 @@ retry: new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); if (!new.entries) { bch_err(c, "error allocating c->replicas_gc"); - return -BCH_ERR_ENOMEM_replicas_gc; + return bch_err_throw(c, ENOMEM_replicas_gc); } mutex_lock(&c->sb_lock); @@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); @@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); @@ -819,19 +819,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (e->data_type == BCH_DATA_cached) continue; - rcu_read_lock(); - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } + scoped_guard(rcu) + for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } - nr_online += test_bit(e->devs[i], devs.d); + nr_online += test_bit(e->devs[i], devs.d); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - rcu_read_unlock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; + } if (nr_online + nr_failed == e->nr_devs) continue; diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 7c0c9c842b4e..b868702a431a 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -26,6 +26,7 @@ enum counters_flags { x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ x(io_move_created_rebalance, 83, TYPE_COUNTER) \ + x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ x(bucket_discard_fast, 79, TYPE_COUNTER) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 861fce1630f0..1506d05e0665 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -253,6 +253,7 @@ DOWNGRADE_TABLE() static int downgrade_table_extra(struct bch_fs *c, darray_char *table) { + unsigned dst_offset = table->nr; struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); int ret = 0; @@ -268,6 +269,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) if (ret) return ret; + dst = (void *) &table->data[dst_offset]; + dst->nr_errors = cpu_to_le16(nr_errors + 1); + /* open coded __set_bit_le64, as dst is packed and * dst->recovery_passes is misaligned */ unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; @@ -278,7 +282,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) break; } - dst->nr_errors = cpu_to_le16(nr_errors); return ret; } @@ -417,7 +420,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); if (!d) { - ret = -BCH_ERR_ENOSPC_sb_downgrade; + ret = bch_err_throw(c, ENOSPC_sb_downgrade); goto out; } diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 013a96883b4e..48853efdc105 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = { .to_text = bch2_sb_errors_to_text, }; +void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) +{ + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 48); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 8); + if (out->nr_tabstops < 3) + printbuf_tabstop_push(out, 16); + + guard(mutex)(&c->fsck_error_counts_lock); + + bch_sb_errors_cpu *e = &c->fsck_error_counts; + darray_for_each(*e, i) { + bch2_sb_error_id_to_text(out, i->id); + prt_tab(out); + prt_u64(out, i->nr); + prt_tab(out); + bch2_prt_datetime(out, i->last_error_time); + prt_newline(out); + } +} + void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) { bch_sb_errors_cpu *e = &c->fsck_error_counts; diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index b2357b8e6107..e86267264692 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -7,6 +7,7 @@ extern const char * const bch2_sb_error_strs[]; void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); +void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_errors; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0bfb151da9cf..d154b7651d28 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -3,9 +3,10 @@ #define _BCACHEFS_SB_ERRORS_FORMAT_H enum bch_fsck_flags { - FSCK_CAN_FIX = 1 << 0, - FSCK_CAN_IGNORE = 1 << 1, - FSCK_AUTOFIX = 1 << 2, + FSCK_CAN_FIX = BIT(0), + FSCK_CAN_IGNORE = BIT(1), + FSCK_AUTOFIX = BIT(2), + FSCK_ERR_NO_LOG = BIT(3), }; #define BCH_SB_ERRS() \ @@ -134,7 +135,7 @@ enum bch_fsck_flags { x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ - x(need_discard_freespace_key_bad, 124, 0) \ + x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ @@ -165,7 +166,7 @@ enum bch_fsck_flags { x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ x(ptr_to_missing_stripe, 150, 0) \ x(ptr_to_incorrect_stripe, 151, 0) \ - x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ x(ptr_too_stale, 153, 0) \ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ @@ -217,7 +218,7 @@ enum bch_fsck_flags { x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ x(inode_snapshot_mismatch, 196, 0) \ - x(snapshot_key_missing_inode_snapshot, 314, 0) \ + x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ x(inode_unlinked_and_not_open, 281, 0) \ @@ -232,10 +233,11 @@ enum bch_fsck_flags { x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ + x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ - x(inode_has_child_snapshots_wrong, 287, 0) \ + x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ @@ -243,26 +245,27 @@ enum bch_fsck_flags { x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ + x(vfs_bad_inode_rm, 320, 0) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ x(extent_overlapping, 215, 0) \ - x(key_in_missing_inode, 216, 0) \ + x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ x(key_in_wrong_inode_type, 217, 0) \ - x(extent_past_end_of_inode, 218, 0) \ + x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ x(dirent_empty_name, 219, 0) \ x(dirent_val_too_big, 220, 0) \ x(dirent_name_too_long, 221, 0) \ x(dirent_name_embedded_nul, 222, 0) \ x(dirent_name_dot_or_dotdot, 223, 0) \ x(dirent_name_has_slash, 224, 0) \ - x(dirent_d_type_wrong, 225, 0) \ + x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ x(inode_bi_parent_wrong, 226, 0) \ x(dirent_in_missing_dir_inode, 227, 0) \ x(dirent_in_non_dir_inode, 228, 0) \ - x(dirent_to_missing_inode, 229, 0) \ + x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ x(dirent_to_overwritten_inode, 302, 0) \ x(dirent_to_missing_subvol, 230, 0) \ x(dirent_to_itself, 231, 0) \ @@ -277,8 +280,8 @@ enum bch_fsck_flags { x(root_dir_missing, 239, 0) \ x(root_inode_not_dir, 240, 0) \ x(dir_loop, 241, 0) \ - x(hash_table_key_duplicate, 242, 0) \ - x(hash_table_key_wrong_offset, 243, 0) \ + x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ + x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ x(reflink_p_front_pad_bad, 245, 0) \ x(journal_entry_dup_same_device, 246, 0) \ @@ -298,7 +301,7 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ @@ -311,7 +314,7 @@ enum bch_fsck_flags { x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ x(accounting_to_invalid_device, 289, 0) \ - x(invalid_btree_id, 274, 0) \ + x(invalid_btree_id, 274, FSCK_AUTOFIX) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ @@ -328,7 +331,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 319, 0) + x(MAX, 321, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 3398906660a5..6245e342a8a8 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -101,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) - return -BCH_ERR_ENOSPC_sb_members_v2; + return bch_err_throw(c, ENOSPC_sb_members_v2); for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); @@ -325,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v1_get(mi, i), gi, sb, i); } @@ -341,9 +349,27 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); - unsigned i; - for (i = 0; i < sb->nr_devices; i++) + if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { + prt_printf(out, "field ends before start of entries"); + return; + } + + if (!le16_to_cpu(mi->member_bytes)) { + prt_printf(out, "member_bytes 0"); + return; + } + + unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); + if (nr != sb->nr_devices) + prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); + + /* + * We call to_text() on superblock sections that haven't passed + * validate, so we can't trust sb->nr_devices. + */ + + for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) member_to_text(out, members_v2_get(mi, i), gi, sb, i); } @@ -378,14 +404,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } - rcu_read_unlock(); } void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) @@ -443,20 +468,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bool ret = true; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { - ret = false; - break; - } + if (ca && + !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) + return false; } - rcu_read_unlock(); - return ret; + return true; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 6bd9b86aee5b..8d8a8a857648 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -28,12 +28,9 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - bool ret = ca && bch2_dev_is_online(ca); - rcu_read_unlock(); - - return ret; + return ca && bch2_dev_is_online(ca); } static inline bool bch2_dev_is_healthy(struct bch_dev *ca) @@ -142,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca) static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - rcu_read_lock(); + guard(rcu)(); bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) bch2_dev_get(ca); - rcu_read_unlock(); - return ca; } @@ -166,7 +161,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, unsigned state_mask, int rw, unsigned ref_idx) { - rcu_read_lock(); + guard(rcu)(); if (ca) enumerated_ref_put(&ca->io_ref[rw], ref_idx); @@ -174,7 +169,6 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, (!((1 << ca->mi.state) & state_mask) || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) ; - rcu_read_unlock(); return ca; } @@ -239,11 +233,10 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (ca) bch2_dev_get(ca); - rcu_read_unlock(); return ca; } @@ -299,19 +292,16 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, { might_sleep(); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) - ca = NULL; - rcu_read_unlock(); + if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) + return NULL; - if (ca && - (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) return ca; - if (ca) - enumerated_ref_put(&ca->io_ref[rw], ref_idx); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); return NULL; } diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7c403427fbdb..538c324f4765 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock) * acquiring the lock and setting the owner field. If we're an RT task * that will live-lock because we won't let the owner complete. */ - rcu_read_lock(); + guard(rcu)(); struct task_struct *owner = READ_ONCE(lock->owner); - bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); - rcu_read_unlock(); - - return ret; + return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); } static inline bool six_optimistic_spin(struct six_lock *lock, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 00d62d1190ef..4c43d2a2c1f5 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -54,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) - ret = -BCH_ERR_ENOENT_snapshot_tree; + ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); return ret; } @@ -67,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans) struct bkey_i_snapshot_tree *s_t; if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_snapshot_tree; + ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); if (ret) return ERR_PTR(ret); @@ -105,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) { - rcu_read_lock(); - bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); } static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) @@ -138,28 +135,25 @@ static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { - bool ret; +#ifdef CONFIG_BCACHEFS_DEBUG + u32 orig_id = id; +#endif - rcu_read_lock(); + guard(rcu)(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) { - ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); - goto out; - } + if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) + return __bch2_snapshot_is_ancestor_early(t, id, ancestor); if (likely(ancestor >= IS_ANCESTOR_BITMAP)) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - ret = id && id < ancestor + bool ret = id && id < ancestor ? test_ancestor_bitmap(t, id, ancestor) : id == ancestor; - EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); -out: - rcu_read_unlock(); - + EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); return ret; } @@ -293,7 +287,7 @@ static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) mutex_lock(&c->snapshot_table_lock); int ret = snapshot_t_mut(c, id) ? 0 - : -BCH_ERR_ENOMEM_mark_snapshot; + : bch_err_throw(c, ENOMEM_mark_snapshot); mutex_unlock(&c->snapshot_table_lock); return ret; } @@ -312,7 +306,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t = snapshot_t_mut(c, id); if (!t) { - ret = -BCH_ERR_ENOMEM_mark_snapshot; + ret = bch_err_throw(c, ENOMEM_mark_snapshot); goto err; } @@ -412,10 +406,10 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, snapshot_id_list *skip) { + guard(rcu)(); u32 id, subvol = 0, s; retry: id = snapshot_root; - rcu_read_lock(); while (id && bch2_snapshot_exists(c, id)) { if (!(skip && snapshot_list_has_id(skip, id))) { s = snapshot_t(c, id)->subvol; @@ -427,7 +421,6 @@ retry: if (id == snapshot_root) break; } - rcu_read_unlock(); if (!subvol && skip) { skip = NULL; @@ -617,18 +610,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - if (!id) return 0; - rcu_read_lock(); - s = snapshot_t(c, id); - if (s->parent) - id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); - rcu_read_unlock(); - - return id; + guard(rcu)(); + const struct snapshot_t *s = snapshot_t(c, id); + return s->parent + ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) + : id; } static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) @@ -882,7 +871,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + if (k.k->type == KEY_TYPE_snapshot_tree && + le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { tree_id = k.k->p.offset; break; } @@ -910,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { - if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + if (k.k->type == KEY_TYPE_subvolume && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { snapshot->v.subvol = cpu_to_le32(k.k->p.offset); SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); break; @@ -947,10 +938,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) { - darray_for_each(*l, i) - if (snapshot_list_has_id(r, *i)) - return true; - return false; + return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; } static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) @@ -1022,7 +1010,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -1061,24 +1049,73 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans, ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node) ?: 1; - /* - * Snapshot missing: we should have caught this with btree_lost_data and - * kicked off reconstruct_snapshots, so if we end up here we have no - * idea what happened: - */ - if (fsck_err_on(state == SNAPSHOT_ID_empty, - trans, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + if (state == SNAPSHOT_ID_empty) { + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened. + * + * Do not delete unless we know that subvolumes and snapshots + * are consistent: + * + * XXX: + * + * We could be smarter here, and instead of using the generic + * recovery pass ratelimiting, track if there have been any + * changes to the snapshots or inodes btrees since those passes + * last ran. + */ + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; + + if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + + unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); + + if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + } + } fsck_err: printbuf_exit(&buf); return ret; } +int __bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), + BTREE_ITER_all_snapshots, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || + snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) + continue; + + ret = snapshot_list_add(c, s, k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + if (ret) + darray_exit(s); + + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1263,7 +1300,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; if (!k.k || !k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; + ret = bch_err_throw(c, ENOSPC_snapshot_create); goto err; } @@ -1399,10 +1436,8 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - darray_for_each(*l, i) - if (i->id == id) - return i->live_child; - return 0; + struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); + return i ? i->live_child : 0; } static unsigned __live_child(struct snapshot_table *t, u32 id, @@ -1434,11 +1469,9 @@ static unsigned live_child(struct bch_fs *c, u32 id) { struct snapshot_delete *d = &c->snapshot_delete; - rcu_read_lock(); - u32 ret = __live_child(rcu_dereference(c->snapshots), id, - &d->delete_leaves, &d->delete_interior); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return __live_child(rcu_dereference(c->snapshots), id, + &d->delete_leaves, &d->delete_interior); } static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) @@ -1695,7 +1728,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, interior_delete_list *skip) { - rcu_read_lock(); + guard(rcu)(); while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); @@ -1704,7 +1737,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, id = __bch2_snapshot_parent(c, id); } while (interior_delete_has_id(skip, id)); } - rcu_read_unlock(); return id; } @@ -1870,6 +1902,8 @@ err: d->running = false; mutex_unlock(&d->progress_lock); bch2_trans_put(trans); + + bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); out_unlock: mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) @@ -1905,7 +1939,7 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work)) + if (!queue_work(system_long_wq, &c->snapshot_delete.work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 382a171f5413..6766bf673ed9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - id = s ? s->tree : 0; - rcu_read_unlock(); - - return id; + return s ? s->tree : 0; } static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - rcu_read_lock(); - id = __bch2_snapshot_parent_early(c, id); - rcu_read_unlock(); - - return id; + guard(rcu)(); + return __bch2_snapshot_parent_early(c, id); } static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -88,20 +82,15 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) { - rcu_read_lock(); - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; + guard(rcu)(); + return __bch2_snapshot_parent(c, id); } static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) { - rcu_read_lock(); + guard(rcu)(); while (n--) id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - return id; } @@ -110,13 +99,11 @@ u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) { - u32 parent; + guard(rcu)(); - rcu_read_lock(); + u32 parent; while ((parent = __bch2_snapshot_parent(c, id))) id = parent; - rcu_read_unlock(); - return id; } @@ -128,11 +115,8 @@ static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) { - rcu_read_lock(); - enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return __bch2_snapshot_id_state(c, id); } static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) @@ -142,12 +126,9 @@ static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; - rcu_read_unlock(); - - return ret; + return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; } static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) @@ -160,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) { - u32 depth; - - rcu_read_lock(); - depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; - rcu_read_unlock(); - - return depth; + guard(rcu)(); + return parent ? snapshot_t(c, parent)->depth + 1 : 0; } bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); @@ -180,20 +156,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *t = snapshot_t(c, id); - bool ret = t && (t->children[0]|t->children[1]) != 0; - rcu_read_unlock(); - - return ret; + return t && (t->children[0]|t->children[1]) != 0; } static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - darray_for_each(*s, i) - if (*i == id) - return true; - return false; + return darray_find(*s, id) != NULL; } static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) @@ -258,6 +228,25 @@ static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, : __bch2_check_key_has_snapshot(trans, iter, k); } +int __bch2_get_snapshot_overwrites(struct btree_trans *, + enum btree_id, struct bpos, + snapshot_id_list *); + +/* + * Get a list of snapshot IDs that have overwritten a given key: + */ +static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + darray_init(s); + + return bch2_snapshot_has_children(trans->c, pos.snapshot) + ? __bch2_get_snapshot_overwrites(trans, btree, pos, s) + : 0; + +} + int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 0cbf5508a32c..3e9f59226bdf 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -31,14 +31,16 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static noinline int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static int bch2_fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old, + bool *updated_before_k_pos) { + struct bch_fs *c = trans->c; struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -47,28 +49,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, dirent_copy_target(new, old); new->k.p = old.k->p; + char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); + ret = PTR_ERR_OR_ZERO(renamed_buf); + if (ret) + return ret; + for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); + new->k.u64s = BKEY_U64s_MAX; - if (u64s > U8_MAX) - return -EINVAL; + struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, + sprintf(renamed_buf, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i)); - new->k.u64s = u64s; + ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); + if (ret) + return ret; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, (subvol_inum) { 0, old.k->p.inode }, old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) + BTREE_UPDATE_internal_snapshot_node| + STR_HASH_must_create); + if (ret && !bch2_err_matches(ret, EEXIST)) break; + if (!ret) { + if (bpos_lt(new->k.p, old.k->p)) + *updated_before_k_pos = true; + break; + } } - if (ret) - return ret; - - return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + bch_err_fn(c, ret); + return ret; } static noinline int hash_pick_winner(struct btree_trans *trans, @@ -186,7 +199,7 @@ int bch2_repair_inode_hash_info(struct btree_trans *trans, #endif bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -221,11 +234,115 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans return ret; } +/* Put a str_hash key in its proper location, checking for duplicates */ +int bch2_str_hash_repair_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k, + struct btree_iter *dup_iter, struct bkey_s_c dup_k, + bool *updated_before_k_pos) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + bool free_snapshots_seen = false; + int ret = 0; + + if (!s) { + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto out; + + s->pos = k_iter->pos; + darray_init(&s->ids); + + ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); + if (ret) + goto out; + + free_snapshots_seen = true; + } + + if (!dup_k.k) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, + (subvol_inum) { 0, new->k.p.inode }, + new->k.p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(dup_k); + if (ret) + goto out; + if (dup_k.k) + goto duplicate_entries; + + if (bpos_lt(new->k.p, k.k->p)) + *updated_before_k_pos = true; + + ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, + k_iter->pos, new->k.p) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + } else { +duplicate_entries: + ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, dup_k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); + break; + case 2: + ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, + bkey_s_c_to_dirent(k), + updated_before_k_pos) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_commit; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, dup_iter); + printbuf_exit(&buf); + if (free_snapshots_seen) + darray_exit(&s->ids); + return ret; +} + int __bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { struct bch_fs *c = trans->c; struct btree_iter iter = {}; @@ -239,24 +356,31 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc->btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { + BTREE_ITER_slots| + BTREE_ITER_with_updates, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) - goto duplicate_entries; + !desc->cmp_bkey(k, hash_k)) { + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, + hash_info) ?: + bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, k, updated_before_k_pos); + break; + } - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); + if (bkey_deleted(k.k)) goto bad_hash; - } } -out: bch2_trans_iter_exit(trans, &iter); +out: +fsck_err: printbuf_exit(&buf); return ret; bad_hash: + bch2_trans_iter_exit(trans, &iter); /* * Before doing any repair, check hash_info itself: */ @@ -265,64 +389,12 @@ bad_hash: goto out; if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? "" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; + "hash table key at wrong offset: should be at %llu\n%s", + hash, + (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) + ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, bkey_s_c_null, + updated_before_k_pos); goto out; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6762b3627e1b..8979ac2d7a3b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -48,9 +48,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) struct bch_hash_info info = { .inum_snapshot = bi->bi_snapshot, .type = INODE_STR_HASH(bi), -#ifdef CONFIG_UNICODE .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, -#endif .siphash_key = { .k0 = bi->bi_hash_seed } }; @@ -261,6 +259,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; @@ -288,7 +287,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!ret) - ret = -BCH_ERR_ENOSPC_str_hash_create; + ret = bch_err_throw(c, ENOSPC_str_hash_create); out: bch2_trans_iter_exit(trans, &slot); bch2_trans_iter_exit(trans, iter); @@ -300,7 +299,7 @@ not_found: bch2_trans_iter_exit(trans, &slot); return k; } else if (!found && (flags & STR_HASH_must_replace)) { - ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; + ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); } else { if (!found && slot.path) swap(*iter, slot); @@ -328,7 +327,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, return ret; if (k.k) { bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_EEXIST_str_hash_set; + return bch_err_throw(trans->c, EEXIST_str_hash_set); } return 0; @@ -397,17 +396,27 @@ int bch2_hash_delete(struct btree_trans *trans, int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); struct snapshots_seen; +int bch2_str_hash_repair_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c, + struct btree_iter *, struct bkey_s_c, + bool *); + int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc *, struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); + struct btree_iter *, struct bkey_s_c, + bool *); static inline int bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { if (hash_k.k->type != desc->key_type) return 0; @@ -415,7 +424,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans, if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) return 0; - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, + updated_before_k_pos); } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 35c9f86a73c1..020587449123 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -130,10 +130,20 @@ static int check_subvol(struct btree_trans *trans, "subvolume %llu points to missing subvolume root %llu:%u", k.k->p.offset, le64_to_cpu(subvol.v->inode), le32_to_cpu(subvol.v->snapshot))) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - ret = ret ?: -BCH_ERR_transaction_restart_nested; - goto err; + /* + * Recreate - any contents that are still disconnected + * will then get reattached under lost+found + */ + bch2_inode_init_early(c, &inode); + bch2_inode_init_late(c, &inode, bch2_current_time(c), + 0, 0, S_IFDIR|0700, 0, NULL); + inode.bi_inum = le64_to_cpu(subvol.v->inode); + inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); + inode.bi_subvol = k.k->p.offset; + inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent); + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto err; } } else { goto err; @@ -141,13 +151,9 @@ static int check_subvol(struct btree_trans *trans, if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree; - struct bch_snapshot_tree st; - - rcu_read_lock(); - snapshot_tree = snapshot_t(c, snapshot_root)->tree; - rcu_read_unlock(); + u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); + struct bch_snapshot_tree st; ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, @@ -259,6 +265,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); } + + if (BCH_SUBVOLUME_RO(s.v)) + prt_printf(out, " ro"); + if (BCH_SUBVOLUME_SNAP(s.v)) + prt_printf(out, " snapshot"); + if (BCH_SUBVOLUME_UNLINKED(s.v)) + prt_printf(out, " unlinked"); } static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) @@ -486,9 +499,12 @@ err: static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { - return bch2_subvolumes_reparent(trans, subvolid) ?: + int ret = bch2_subvolumes_reparent(trans, subvolid) ?: commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_subvolume_delete(trans, subvolid)); + + bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols); + return ret; } static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) @@ -597,7 +613,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, ret = bch2_bkey_get_empty_slot(trans, &dst_iter, BTREE_ID_subvolumes, POS(0, U32_MAX)); if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_subvolume_create; + ret = bch_err_throw(c, ENOSPC_subvolume_create); if (ret) return ret; @@ -703,8 +719,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) return ret; if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; + struct bch_fs *c = trans->c; + bch_err(c, "root inode not found"); + ret = bch_err_throw(c, ENOENT_inode); goto err; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6687b9235d3c..6c2e1d647403 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1112,7 +1112,7 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, ")"); bch2_fs_fatal_error(c, ": %s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_sb_not_downgraded; + ret = bch_err_throw(c, sb_not_downgraded); goto out; } @@ -1142,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c) if (c->opts.errors != BCH_ON_ERROR_continue && c->opts.errors != BCH_ON_ERROR_fix_safe) { - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); bch2_fs_fatal_error(c, "%s", buf.buf); } else { bch_err(c, "%s", buf.buf); @@ -1161,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c) ca->disk_sb.seq); bch2_fs_fatal_error(c, "%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); } } @@ -1215,7 +1215,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 11579b74c640..c46b1053a02c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -104,7 +104,7 @@ const char * const bch2_dev_write_refs[] = { #undef x static void __bch2_print_str(struct bch_fs *c, const char *prefix, - const char *str, bool nonblocking) + const char *str) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -114,17 +114,12 @@ static void __bch2_print_str(struct bch_fs *c, const char *prefix, return; } #endif - bch2_print_string_as_lines(KERN_ERR, str, nonblocking); + bch2_print_string_as_lines(KERN_ERR, str); } void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) { - __bch2_print_str(c, prefix, str, false); -} - -void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) -{ - __bch2_print_str(c, prefix, str, true); + __bch2_print_str(c, prefix, str); } __printf(2, 0) @@ -215,27 +210,20 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - rcu_read_lock(); + guard(mutex)(&bch_fs_list_lock); + guard(rcu)(); + struct bch_fs *c; list_for_each_entry(c, &bch_fs_list, list) for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); - goto found; + return c; } - c = NULL; -found: - rcu_read_unlock(); - mutex_unlock(&bch_fs_list_lock); - - return c; + return NULL; } static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) @@ -480,16 +468,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return -BCH_ERR_erofs_no_alloc_info; + return bch_err_throw(c, erofs_no_alloc_info); if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); - return -BCH_ERR_erofs_unfixed_errors; + return bch_err_throw(c, erofs_unfixed_errors); } if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { bch_err(c, "cannot go rw, filesystem is an unresized image file"); - return -BCH_ERR_erofs_filesystem_full; + return bch_err_throw(c, erofs_filesystem_full); } if (test_bit(BCH_FS_rw, &c->flags)) @@ -507,13 +495,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_clean_shutdown, &c->flags); - rcu_read_lock(); - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - bch2_dev_allocator_add(c, ca); - enumerated_ref_start(&ca->io_ref[WRITE]); - } - rcu_read_unlock(); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + bch2_dev_allocator_add(c, ca); + enumerated_ref_start(&ca->io_ref[WRITE]); + } bch2_recalc_capacity(c); @@ -571,13 +558,13 @@ int bch2_fs_read_write(struct bch_fs *c) { if (c->opts.recovery_pass_last && c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) - return -BCH_ERR_erofs_norecovery; + return bch_err_throw(c, erofs_norecovery); if (c->opts.nochanges) - return -BCH_ERR_erofs_nochanges; + return bch_err_throw(c, erofs_nochanges); if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - return -BCH_ERR_erofs_no_alloc_info; + return bch_err_throw(c, erofs_no_alloc_info); return __bch2_fs_read_write(c, false); } @@ -762,7 +749,7 @@ static int bch2_fs_online(struct bch_fs *c) if (c->sb.multi_device && __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); - return -BCH_ERR_filesystem_uuid_already_open; + return bch_err_throw(c, filesystem_uuid_already_open); } ret = bch2_fs_chardev_init(c); @@ -806,7 +793,7 @@ err: return ret; } -static int bch2_fs_init_rw(struct bch_fs *c) +int bch2_fs_init_rw(struct bch_fs *c) { if (test_bit(BCH_FS_rw_init_done, &c->flags)) return 0; @@ -821,7 +808,7 @@ static int bch2_fs_init_rw(struct bch_fs *c) WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0))) - return -BCH_ERR_ENOMEM_fs_other_alloc; + return bch_err_throw(c, ENOMEM_fs_other_alloc); int ret = bch2_fs_btree_interior_update_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: @@ -1002,7 +989,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = -BCH_ERR_ENOMEM_fs_other_alloc; + ret = bch_err_throw(c, ENOMEM_fs_other_alloc); goto err; } @@ -1027,21 +1014,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, if (ret) goto err; + if (go_rw_in_recovery(c)) { + /* + * start workqueues/kworkers early - kthread creation checks for + * pending signals, which is _very_ annoying + */ + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + } + #ifdef CONFIG_UNICODE - /* Default encoding until we can potentially have more as an option. */ - c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); - if (IS_ERR(c->cf_encoding)) { - printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); - ret = -EINVAL; - goto err; + if (bch2_fs_casefold_enabled(c)) { + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } } - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); @@ -1083,12 +1078,13 @@ noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; - struct printbuf p = PRINTBUF; - bool first = true; + CLASS(printbuf, p)(); + bch2_log_msg_start(c, &p); prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); + bool first = true; for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1105,17 +1101,24 @@ static void print_mount_opts(struct bch_fs *c) } if (c->sb.version_incompat_allowed != c->sb.version) { - prt_printf(&p, "\n allowing incompatible features above "); + prt_printf(&p, "\nallowing incompatible features above "); bch2_version_to_text(&p, c->sb.version_incompat_allowed); } if (c->opts.verbose) { - prt_printf(&p, "\n features: "); + prt_printf(&p, "\nfeatures: "); prt_bitflags(&p, bch2_sb_features, c->sb.features); } - bch_info(c, "%s", p.buf); - printbuf_exit(&p); + if (c->sb.multi_device) { + prt_printf(&p, "\nwith devices"); + for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { + prt_char(&p, ' '); + prt_str(&p, ca->name); + } + } + + bch2_print_str(c, KERN_INFO, p.buf); } static bool bch2_fs_may_start(struct bch_fs *c) @@ -1159,8 +1162,14 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); + if (c->cf_encoding) + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + if (!bch2_fs_may_start(c)) - return -BCH_ERR_insufficient_devices_to_start; + return bch_err_throw(c, insufficient_devices_to_start); down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1171,7 +1180,7 @@ int bch2_fs_start(struct bch_fs *c) sizeof(struct bch_sb_field_ext) / sizeof(u64))) { mutex_unlock(&c->sb_lock); up_write(&c->state_lock); - ret = -BCH_ERR_ENOSPC_sb; + ret = bch_err_throw(c, ENOSPC_sb); goto err; } @@ -1182,22 +1191,20 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - rcu_read_lock(); - for_each_online_member_rcu(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(now); - rcu_read_unlock(); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); /* * Dno't write superblock yet: recovery might have to downgrade */ mutex_unlock(&c->sb_lock); - rcu_read_lock(); - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) - bch2_dev_allocator_add(c, ca); - rcu_read_unlock(); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); up_write(&c->state_lock); @@ -1215,7 +1222,7 @@ int bch2_fs_start(struct bch_fs *c) goto err; if (bch2_fs_init_fault("fs_start")) { - ret = -BCH_ERR_injected_fs_start; + ret = bch_err_throw(c, injected_fs_start); goto err; } @@ -1242,11 +1249,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return -BCH_ERR_mismatched_block_size; + return bch_err_throw(c, mismatched_block_size); if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return -BCH_ERR_bucket_size_too_small; + return bch_err_throw(c, bucket_size_too_small); return 0; } @@ -1557,7 +1564,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) bch2_dev_attach(c, ca, dev_idx); return 0; err: - return -BCH_ERR_ENOMEM_dev_alloc; + return bch_err_throw(c, ENOMEM_dev_alloc); } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -1567,13 +1574,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); - return -BCH_ERR_device_already_online; + return bch_err_throw(ca->fs, device_already_online); } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); - return -BCH_ERR_device_size_too_small; + return bch_err_throw(ca->fs, device_size_too_small); } BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); @@ -1725,7 +1732,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); @@ -1778,7 +1785,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); - ret = -BCH_ERR_device_state_not_allowed; + ret = bch_err_throw(c, device_state_not_allowed); goto err; } @@ -1914,7 +1921,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (list_empty(&c->list)) { mutex_lock(&bch_fs_list_lock); if (__bch2_uuid_to_fs(c->sb.uuid)) - ret = -BCH_ERR_filesystem_uuid_already_open; + ret = bch_err_throw(c, filesystem_uuid_already_open); else list_add(&c->list, &bch_fs_list); mutex_unlock(&bch_fs_list_lock); @@ -2001,6 +2008,22 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_late; } + /* + * We just changed the superblock UUID, invalidate cache and send a + * uevent to update /dev/disk/by-uuid + */ + invalidate_bdev(ca->disk_sb.bdev); + + char uuid_str[37]; + snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); + + char *envp[] = { + "CHANGE=uuid", + uuid_str, + NULL, + }; + kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); + up_write(&c->state_lock); out: printbuf_exit(&label); @@ -2101,7 +2124,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); } __bch2_dev_offline(c, ca); @@ -2140,7 +2163,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { bch_err(ca, "New device size too big (%llu greater than max %u)", nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = -BCH_ERR_device_size_too_big; + ret = bch_err_throw(c, device_size_too_big); goto err; } @@ -2148,7 +2171,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); - ret = -BCH_ERR_device_size_too_small; + ret = bch_err_throw(c, device_size_too_small); goto err; } @@ -2383,7 +2406,7 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices, } if (opts->nochanges && !opts->read_only) { - ret = -BCH_ERR_erofs_nochanges; + ret = bch_err_throw(c, erofs_nochanges); goto err_print; } diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index dc52f06cb2b9..e90bab9afe78 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -46,6 +46,7 @@ void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); +int bch2_fs_init_rw(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1a55196d69f1..05848375cea2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -26,6 +26,7 @@ #include "disk_groups.h" #include "ec.h" #include "enumerated_ref.h" +#include "error.h" #include "inode.h" #include "journal.h" #include "journal_reclaim.h" @@ -37,6 +38,7 @@ #include "rebalance.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-errors.h" #include "super-io.h" #include "tests.h" @@ -143,6 +145,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_commit); write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); @@ -151,6 +154,7 @@ write_attribute(trigger_btree_updates); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_recalc_capacity); write_attribute(trigger_delete_dead_snapshots); +write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -172,6 +176,7 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); +read_attribute(errors); read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); @@ -353,6 +358,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); + if (attr == &sysfs_errors) + bch2_fs_errors_to_text(out, c); + if (attr == &sysfs_new_stripes) bch2_new_stripes_to_text(out, c); @@ -428,6 +436,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_commit) + bch2_journal_flush(&c->journal); + if (attr == &sysfs_trigger_journal_flush) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); @@ -448,6 +459,16 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_delete_dead_snapshots) __bch2_delete_dead_snapshots(c); + if (attr == &sysfs_trigger_emergency_read_only) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by sysfs\n"); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -483,6 +504,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_recovery_status, &sysfs_compression_stats, + &sysfs_errors, #ifdef CONFIG_BCACHEFS_TESTS &sysfs_perf_test, @@ -571,6 +593,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_commit, &sysfs_trigger_journal_flush, &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, @@ -579,6 +602,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_freelist_wakeup, &sysfs_trigger_recalc_capacity, &sysfs_trigger_delete_dead_snapshots, + &sysfs_trigger_emergency_read_only, &sysfs_gc_gens_pos, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 8cb5b40704fd..9c5a9c551f03 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* errors */ + +TRACE_EVENT(error_throw, + TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), + TP_ARGS(c, bch_err, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(int, err ) + __array(char, err_str, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->err = bch_err; + strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ip, __entry->err_str) +); + +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s ret %s -> %s %s", __entry->ip, + __entry->bch_err, __entry->std_err, __entry->ip) +); + /* disk_accounting.c */ TRACE_EVENT(accounting_mem_insert, @@ -1036,34 +1080,14 @@ TRACE_EVENT(trans_blocked_journal_reclaim, __entry->must_wait) ); -TRACE_EVENT(trans_restart_journal_preres_get, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - unsigned flags), - TP_ARGS(trans, caller_ip, flags), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(unsigned, flags ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->flags = flags; - ), - - TP_printk("%s %pS %x", __entry->trans_fn, - (void *) __entry->caller_ip, - __entry->flags) -); - +#if 0 +/* todo: bring back dynamic fault injection */ DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) ); +#endif DEFINE_EVENT(transaction_event, trans_traverse_all, TP_PROTO(struct btree_trans *trans, @@ -1151,19 +1175,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - -DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1185,13 +1196,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path), - TP_ARGS(trans, caller_ip, path) -); - DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1250,44 +1254,6 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes) ); -TRACE_EVENT(trans_restart_key_cache_key_realloced, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_u64s, - unsigned new_u64s), - TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(enum btree_id, btree_id ) - TRACE_BPOS_entries(pos) - __field(u32, old_u64s ) - __field(u32, new_u64s ) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - - __entry->btree_id = path->btree_id; - TRACE_BPOS_assign(pos, path->pos); - __entry->old_u64s = old_u64s; - __entry->new_u64s = new_u64s; - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_u64s, - __entry->new_u64s) -); - DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), @@ -1431,28 +1397,44 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, io_move_pred, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + DEFINE_EVENT(fs_str, io_move_created_rebalance, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), +DEFINE_EVENT(fs_str, io_move_evacuate_bucket, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_STRUCT__entry( - __array(char, bch_err, 32 ) - __array(char, std_err, 32 ) - __array(char, ip, 32 ) - ), +DEFINE_EVENT(fs_str, extent_trim_atomic, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_fast_assign( - strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); - strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), +DEFINE_EVENT(fs_str, btree_iter_peek_slot, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, __btree_iter_peek, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +DEFINE_EVENT(fs_str, btree_iter_peek_max, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS @@ -1867,21 +1849,6 @@ TRACE_EVENT(btree_path_free, __entry->dup_locked) ); -TRACE_EVENT(btree_path_free_trans_begin, - TP_PROTO(btree_path_idx_t path), - TP_ARGS(path), - - TP_STRUCT__entry( - __field(btree_path_idx_t, idx ) - ), - - TP_fast_assign( - __entry->idx = path; - ), - - TP_printk(" path %3u", __entry->idx) -); - #else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ #ifndef _TRACE_BCACHEFS_H @@ -1899,7 +1866,6 @@ static inline void trace_btree_path_traverse_start(struct btree_trans *trans, st static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} -static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} #endif #endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index dc3817f545fa..df9a6071fe18 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -262,8 +262,7 @@ static bool string_is_spaces(const char *str) return true; } -void bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +void bch2_print_string_as_lines(const char *prefix, const char *lines) { bool locked = false; const char *p; @@ -273,12 +272,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines, return; } - if (!nonblocking) { - console_lock(); - locked = true; - } else { - locked = console_trylock(); - } + locked = console_trylock(); while (*lines) { p = strchrnul(lines, '\n'); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 25cf61ebd40c..6488f098d140 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -17,6 +17,7 @@ #include <linux/random.h> #include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -213,7 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *, const char *, bool); +void bch2_print_string_as_lines(const char *, const char *); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); @@ -672,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes) u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -#define cmp_int(l, r) ((l > r) - (l < r)) - static inline int u8_cmp(u8 l, u8 r) { return cmp_int(l, r); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index fa66a09e496a..d33d6bde992b 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -27,7 +27,7 @@ const struct file_operations bfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, }; @@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, inode->i_size); } -static int bfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int bfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a43363d593e5..264fba0d44bd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -519,7 +519,7 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, /* Sanity check the number of program headers... */ /* ...and their total size. */ size = sizeof(struct elf_phdr) * elf_ex->e_phnum; - if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN) + if (size == 0 || size > 65536) goto out; elf_phdata = kmalloc(size, GFP_KERNEL); @@ -646,7 +646,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, if (!elf_check_arch(interp_elf_ex) || elf_check_fdpic(interp_elf_ex)) goto out; - if (!interpreter->f_op->mmap) + if (!can_mmap_file(interpreter)) goto out; total_size = total_mapping_size(interp_elf_phdata, @@ -848,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; if (elf_check_fdpic(elf_ex)) goto out; - if (!bprm->file->f_op->mmap) + if (!can_mmap_file(bprm->file)) goto out; elf_phdata = load_elf_phdrs(elf_ex, bprm->file); @@ -1450,8 +1450,8 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_align = 4; } -static void fill_note(struct memelfnote *note, const char *name, int type, - unsigned int sz, void *data) +static void __fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) { note->name = name; note->type = type; @@ -1459,6 +1459,9 @@ static void fill_note(struct memelfnote *note, const char *name, int type, note->data = data; } +#define fill_note(note, type, sz, data) \ + __fill_note(note, NN_ ## type, NT_ ## type, sz, data) + /* * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. @@ -1549,14 +1552,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(note, AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); - fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata); + fill_note(note, SIGINFO, sizeof(*csigdata), csigdata); } /* @@ -1652,7 +1655,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm } size = name_curpos - (char *)data; - fill_note(note, NN_FILE, NT_FILE, size, data); + fill_note(note, FILE, size, data); return 0; } @@ -1713,8 +1716,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, - PRSTATUS_SIZE, &t->prstatus); + fill_note(&t->notes[0], PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1727,6 +1729,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, for (view_iter = 1; view_iter < view->n; ++view_iter) { const struct user_regset *regset = &view->regsets[view_iter]; int note_type = regset->core_note_type; + const char *note_name = regset->core_note_name; bool is_fpreg = note_type == NT_PRFPREG; void *data; int ret; @@ -1747,8 +1750,16 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX", - note_type, ret, data); + /* There should be a note name, but if not, guess: */ + if (WARN_ON_ONCE(!note_name)) + note_name = "LINUX"; + else + /* Warn on non-legacy-compatible names, for now. */ + WARN_ON_ONCE(strcmp(note_name, + is_fpreg ? "CORE" : "LINUX")); + + __fill_note(&t->notes[note_iter], note_name, note_type, + ret, data); info->size += notesize(&t->notes[note_iter]); note_iter++; @@ -1767,8 +1778,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), - &(t->prstatus)); + fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus); info->size += notesize(&t->notes[0]); fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL); @@ -1778,7 +1788,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, } t->prstatus.pr_fpvalid = 1; - fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu); + fill_note(&t->notes[1], PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; @@ -1798,7 +1808,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; - fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&info->psinfo, PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9133f3827f90..48fd2de3bca0 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -109,7 +109,7 @@ static int is_elf(struct elfhdr *hdr, struct file *file) return 0; if (!elf_check_arch(hdr)) return 0; - if (!file->f_op->mmap) + if (!can_mmap_file(file)) return 0; return 1; } @@ -1275,8 +1275,8 @@ static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offs return; } -static inline void fill_note(struct memelfnote *note, const char *name, int type, - unsigned int sz, void *data) +static inline void __fill_note(struct memelfnote *note, const char *name, int type, + unsigned int sz, void *data) { note->name = name; note->type = type; @@ -1285,6 +1285,9 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type return; } +#define fill_note(note, type, sz, data) \ + __fill_note(note, NN_ ## type, NT_ ## type, sz, data) + /* * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. @@ -1398,8 +1401,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), - &t->prstatus); + fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus); t->num_notes++; *sz += notesize(&t->notes[0]); @@ -1416,8 +1418,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ } if (t->prstatus.pr_fpvalid) { - fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu), - &t->fpu); + fill_note(&t->notes[1], PRFPREG, sizeof(t->fpu), &t->fpu); t->num_notes++; *sz += notesize(&t->notes[1]); } @@ -1531,7 +1532,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ fill_psinfo(psinfo, current->group_leader, current->mm); - fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&psinfo_note, PRPSINFO, sizeof(*psinfo), psinfo); thread_status_size += notesize(&psinfo_note); auxv = (elf_addr_t *) current->mm->saved_auxv; @@ -1539,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(&auxv_note, AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); offset = sizeof(*elf); /* ELF header */ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 432fbf4fc334..a839f960cd4a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -675,44 +675,6 @@ static void bm_evict_inode(struct inode *inode) } /** - * unlink_binfmt_dentry - remove the dentry for the binary type handler - * @dentry: dentry associated with the binary type handler - * - * Do the actual filesystem work to remove a dentry for a registered binary - * type handler. Since binfmt_misc only allows simple files to be created - * directly under the root dentry of the filesystem we ensure that we are - * indeed passed a dentry directly beneath the root dentry, that the inode - * associated with the root dentry is locked, and that it is a regular file we - * are asked to remove. - */ -static void unlink_binfmt_dentry(struct dentry *dentry) -{ - struct dentry *parent = dentry->d_parent; - struct inode *inode, *parent_inode; - - /* All entries are immediate descendants of the root dentry. */ - if (WARN_ON_ONCE(dentry->d_sb->s_root != parent)) - return; - - /* We only expect to be called on regular files. */ - inode = d_inode(dentry); - if (WARN_ON_ONCE(!S_ISREG(inode->i_mode))) - return; - - /* The parent inode must be locked. */ - parent_inode = d_inode(parent); - if (WARN_ON_ONCE(!inode_is_locked(parent_inode))) - return; - - if (simple_positive(dentry)) { - dget(dentry); - simple_unlink(parent_inode, dentry); - d_delete(dentry); - dput(dentry); - } -} - -/** * remove_binfmt_handler - remove a binary type handler * @misc: handle to binfmt_misc instance * @e: binary type handler to remove @@ -729,7 +691,7 @@ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e) write_lock(&misc->entries_lock); list_del_init(&e->list); write_unlock(&misc->entries_lock); - unlink_binfmt_dentry(e->dentry); + locked_recursive_removal(e->dentry, NULL); } /* /<entry> */ @@ -772,7 +734,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ inode = d_inode(inode->i_sb->s_root); - inode_lock(inode); + inode_lock_nested(inode, I_MUTEX_PARENT); /* * In order to add new element or remove elements from the list @@ -922,7 +884,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ inode = d_inode(file_inode(file)->i_sb->s_root); - inode_lock(inode); + inode_lock_nested(inode, I_MUTEX_PARENT); /* * In order to add new element or remove elements from the list diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 08412532db1b..1e36a12b88f7 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/file.h> +#include <linux/kernfs.h> #include <linux/mm.h> #include <linux/xattr.h> @@ -322,6 +323,39 @@ __bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name_ return ret; } +#ifdef CONFIG_CGROUPS +/** + * bpf_cgroup_read_xattr - read xattr of a cgroup's node in cgroupfs + * @cgroup: cgroup to get xattr from + * @name__str: name of the xattr + * @value_p: output buffer of the xattr value + * + * Get xattr *name__str* of *cgroup* and store the output in *value_ptr*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: length of the xattr value on success, a negative value on error. + */ +__bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, + struct bpf_dynptr *value_p) +{ + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + u32 value_len; + void *value; + + /* Only allow reading "user.*" xattrs */ + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + return kernfs_xattr_get(cgroup->kn, name__str, value, value_len); +} +#endif /* CONFIG_CGROUPS */ + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index c352f3ae0385..ea95c90c8474 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -114,6 +114,8 @@ config BTRFS_EXPERIMENTAL - extent tree v2 - complex rework of extent tracking + - large folio support + If unsure, say N. config BTRFS_FS_REF_VERIFY diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index e3716516ca38..861c7d92c437 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -9,27 +9,24 @@ #include "fs.h" #include "accessors.h" -static bool check_setget_bounds(const struct extent_buffer *eb, - const void *ptr, unsigned off, int size) +static void __cold report_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) { - const unsigned long member_offset = (unsigned long)ptr + off; + unsigned long member_offset = (unsigned long)ptr + off; - if (unlikely(member_offset + size > eb->len)) { - btrfs_warn(eb->fs_info, - "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", - (member_offset > eb->len ? "start" : "end"), - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - - return true; + btrfs_warn(eb->fs_info, + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), + (unsigned long)ptr, eb->start, member_offset, size); } -void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +/* Copy bytes from @src1 and @src2 to @dest. */ +static __always_inline void memcpy_split_src(char *dest, const char *src1, + const char *src2, const size_t len1, + const size_t total) { - token->eb = eb; - token->kaddr = folio_address(eb->folios[0]); - token->offset = 0; + memcpy(dest, src1, len1); + memcpy(dest + len1, src2, total - len1); } /* @@ -41,11 +38,6 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e * - btrfs_set_8 (for 8/16/32/64) * - btrfs_get_8 (for 8/16/32/64) * - * Generic helpers with a token (cached address of the most recently accessed - * page): - * - btrfs_set_token_8 (for 8/16/32/64) - * - btrfs_get_token_8 (for 8/16/32/64) - * * The set/get functions handle data spanning two pages transparently, in case * metadata block size is larger than page. Every pointer to metadata items is * an offset into the extent buffer page array, cast to a specific type. This @@ -57,118 +49,66 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ - const unsigned long oil = get_eb_offset_in_folio(token->eb, \ - member_offset);\ - const int unit_size = token->eb->folio_size; \ - const int unit_shift = token->eb->folio_shift; \ - const int size = sizeof(u##bits); \ - u8 lebytes[sizeof(u##bits)]; \ - const int part = unit_size - oil; \ - \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + unit_size) { \ - return get_unaligned_le##bits(token->kaddr + oil); \ - } \ - token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << unit_shift; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ - return get_unaligned_le##bits(token->kaddr + oil); \ - \ - memcpy(lebytes, token->kaddr + oil, part); \ - token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << unit_shift; \ - memcpy(lebytes + part, token->kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ - const unsigned long oil = get_eb_offset_in_folio(eb, \ - member_offset);\ - const int unit_size = eb->folio_size; \ - char *kaddr = folio_address(eb->folios[idx]); \ - const int size = sizeof(u##bits); \ - const int part = unit_size - oil; \ - u8 lebytes[sizeof(u##bits)]; \ - \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \ - return get_unaligned_le##bits(kaddr + oil); \ - \ - memcpy(lebytes, kaddr + oil, part); \ - kaddr = folio_address(eb->folios[idx + 1]); \ - memcpy(lebytes + part, kaddr, size - part); \ - return get_unaligned_le##bits(lebytes); \ -} \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val) \ -{ \ - const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \ - const unsigned long oil = get_eb_offset_in_folio(token->eb, \ + const unsigned long oif = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = token->eb->folio_size; \ - const int unit_shift = token->eb->folio_shift; \ - const int size = sizeof(u##bits); \ + char *kaddr = folio_address(eb->folios[idx]) + oif; \ + const int part = eb->folio_size - oif; \ u8 lebytes[sizeof(u##bits)]; \ - const int part = unit_size - oil; \ \ - ASSERT(token); \ - ASSERT(token->kaddr); \ - ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ - if (token->offset <= member_offset && \ - member_offset + size <= token->offset + unit_size) { \ - put_unaligned_le##bits(val, token->kaddr + oil); \ - return; \ + if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \ + report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \ + return 0; \ } \ - token->kaddr = folio_address(token->eb->folios[idx]); \ - token->offset = idx << unit_shift; \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ - oil + size <= unit_size) { \ - put_unaligned_le##bits(val, token->kaddr + oil); \ - return; \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \ + likely(sizeof(u##bits) <= part)) \ + return get_unaligned_le##bits(kaddr); \ + \ + if (sizeof(u##bits) == 2) { \ + lebytes[0] = *kaddr; \ + kaddr = folio_address(eb->folios[idx + 1]); \ + lebytes[1] = *kaddr; \ + } else { \ + memcpy_split_src(lebytes, kaddr, \ + folio_address(eb->folios[idx + 1]), \ + part, sizeof(u##bits)); \ } \ - put_unaligned_le##bits(val, lebytes); \ - memcpy(token->kaddr + oil, lebytes, part); \ - token->kaddr = folio_address(token->eb->folios[idx + 1]); \ - token->offset = (idx + 1) << unit_shift; \ - memcpy(token->kaddr, lebytes + part, size - part); \ + return get_unaligned_le##bits(lebytes); \ } \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ const unsigned long idx = get_eb_folio_index(eb, member_offset);\ - const unsigned long oil = get_eb_offset_in_folio(eb, \ + const unsigned long oif = get_eb_offset_in_folio(eb, \ member_offset);\ - const int unit_size = eb->folio_size; \ - char *kaddr = folio_address(eb->folios[idx]); \ - const int size = sizeof(u##bits); \ - const int part = unit_size - oil; \ + char *kaddr = folio_address(eb->folios[idx]) + oif; \ + const int part = eb->folio_size - oif; \ u8 lebytes[sizeof(u##bits)]; \ \ - ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (INLINE_EXTENT_BUFFER_PAGES == 1 || \ - oil + size <= unit_size) { \ - put_unaligned_le##bits(val, kaddr + oil); \ + if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \ + report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \ + return; \ + } \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \ + likely(sizeof(u##bits) <= part)) { \ + put_unaligned_le##bits(val, kaddr); \ return; \ } \ - \ put_unaligned_le##bits(val, lebytes); \ - memcpy(kaddr + oil, lebytes, part); \ - kaddr = folio_address(eb->folios[idx + 1]); \ - memcpy(kaddr, lebytes + part, size - part); \ + if (sizeof(u##bits) == 2) { \ + *kaddr = lebytes[0]; \ + kaddr = folio_address(eb->folios[idx + 1]); \ + *kaddr = lebytes[1]; \ + } else { \ + memcpy(kaddr, lebytes, part); \ + kaddr = folio_address(eb->folios[idx + 1]); \ + memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \ + } \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 15ea6348800b..99b3ced12805 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -16,14 +16,6 @@ struct extent_buffer; -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - -void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); - /* * Some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple one @@ -56,11 +48,6 @@ static inline void put_unaligned_le8(u8 val, void *p) sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ @@ -83,18 +70,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ @@ -479,18 +454,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ } BTRFS_ITEM_SETGET_FUNCS(offset) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ed497f5f8d1b..6a450be293b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -733,7 +733,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, struct preftrees *preftrees, struct share_check *sc) { - int err; int ret = 0; struct ulist *parents; struct ulist_node *node; @@ -752,6 +751,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, */ while ((rnode = rb_first_cached(&preftrees->indirect.root))) { struct prelim_ref *ref; + int ret2; ref = rb_entry(rnode, struct prelim_ref, rbnode); if (WARN(ref->parent, @@ -773,18 +773,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); + ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ - if (err == -ENOENT) { + if (ret2 == -ENOENT) { prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; - } else if (err) { + } else if (ret2) { free_pref(ref); - ret = err; + ret = ret2; goto out; } @@ -2201,7 +2201,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, int ret; u64 flags; u64 size = 0; - u32 item_size; const struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; @@ -2244,7 +2243,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } eb = path->nodes[0]; - item_size = btrfs_item_size(eb, path->slots[0]); ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(eb, ei); @@ -2252,7 +2250,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, btrfs_debug(fs_info, "logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u", logical, logical - found_key->objectid, found_key->objectid, - found_key->offset, flags, item_size); + found_key->offset, flags, btrfs_item_size(eb, path->slots[0])); WARN_ON(!flags_ret); if (flags_ret) { @@ -2548,17 +2546,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c } int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, void *ctx, bool ignore_offset) { struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; u64 flags = 0; struct btrfs_key found_key; - int search_commit_root = path->search_commit_root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; ret = extent_from_logical(fs_info, logical, path, &found_key, &flags); - btrfs_release_path(path); + btrfs_free_path(path); if (ret < 0) return ret; if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) @@ -2571,8 +2572,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, walk_ctx.extent_item_pos = logical - found_key.objectid; walk_ctx.fs_info = fs_info; - return iterate_extent_inodes(&walk_ctx, search_commit_root, - build_ino_list, ctx); + return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, @@ -3161,18 +3161,14 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) ASSERT(!cache->nr_edges); } -void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which) +static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper) { ASSERT(upper && lower && upper->level == lower->level + 1); edge->node[LOWER] = lower; edge->node[UPPER] = upper; - if (link_which & LINK_LOWER) - list_add_tail(&edge->list[LOWER], &lower->upper); - if (link_which & LINK_UPPER) - list_add_tail(&edge->list[UPPER], &upper->lower); + list_add_tail(&edge->list[LOWER], &lower->upper); } /* * Handle direct tree backref @@ -3242,7 +3238,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); + btrfs_backref_link_edge(edge, cur, upper); return 0; } @@ -3412,7 +3408,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, if (!upper->owner) upper->owner = btrfs_header_owner(eb); } - btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); + btrfs_backref_link_edge(edge, lower, upper); if (rb_node) { btrfs_put_root(root); @@ -3570,7 +3566,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, ASSERT(start->checked); - rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node); + rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node); if (rb_node) btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST); @@ -3621,8 +3617,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, return -EUCLEAN; } - rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); + rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node); if (unlikely(rb_node)) { btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); return -EUCLEAN; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 953637115956..34b0193a181c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, void *ctx, - bool ignore_offset); + void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); @@ -313,10 +312,15 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); * Represent a tree block in the backref cache */ struct btrfs_backref_node { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simple_node for search/insert */ + union{ + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; /* * This is a sanity check, whenever we COW a block we will update @@ -423,13 +427,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1U << 0) -#define LINK_UPPER (1U << 1) - -void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, - struct btrfs_backref_node *lower, - struct btrfs_backref_node *upper, - int link_which); void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node); void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f7d8958b7327..50b5fc1c06d7 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -27,12 +27,12 @@ struct btrfs_failed_bio { }; /* Is this a data path I/O that needs storage layer checksum and repair? */ -static inline bool is_data_bbio(struct btrfs_bio *bbio) +static inline bool is_data_bbio(const struct btrfs_bio *bbio) { return bbio->inode && is_data_inode(bbio->inode); } -static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio) { return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; } @@ -134,14 +134,14 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) } } -static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror) { if (cur_mirror == fbio->num_copies) return cur_mirror + 1 - fbio->num_copies; return cur_mirror + 1; } -static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) +static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror) { if (cur_mirror == 1) return fbio->num_copies; @@ -165,12 +165,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); int mirror = repair_bbio->mirror_num; - /* - * We can only trigger this for data bio, which doesn't support larger - * folios yet. - */ - ASSERT(folio_order(page_folio(bv->bv_page)) == 0); - if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); @@ -301,7 +295,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de btrfs_bio_end_io(bbio, bbio->bio.bi_status); } -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev) { if (!dev || !dev->bdev) return; @@ -316,8 +310,8 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); } -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) +static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info, + const struct bio *bio) { if (bio->bi_opf & REQ_META) return fs_info->endio_meta_workers; @@ -439,7 +433,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) ASSERT(btrfs_dev_is_sequential(dev, physical)); bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } - btrfs_debug_in_rcu(dev->fs_info, + btrfs_debug(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), @@ -845,7 +839,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, goto out_bio_uninit; } - btrfs_info_rl_in_rcu(fs_info, + btrfs_info_rl(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, btrfs_dev_name(smap.dev), smap.physical >> SECTOR_SHIFT); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5b0cb04b2b93..9bf282d2453c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group } #endif +static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group) +{ + /* The meta_write_pointer is available only on the zoned setup. */ + if (!btrfs_is_zoned(block_group->fs_info)) + return false; + + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return block_group->start + block_group->alloc_offset > + block_group->meta_write_pointer; +} + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress @@ -832,8 +845,8 @@ out: static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { - btrfs_clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_DIRTY); + btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_DIRTY, NULL); } static noinline void caching_thread(struct btrfs_work *work) @@ -877,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work) */ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) - ret = load_free_space_tree(caching_ctl); + ret = btrfs_load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); done: @@ -1235,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * another task to attempt to create another block group with the same * item key (and failing with -EEXIST and a transaction abort). */ - ret = remove_block_group_free_space(trans, block_group); + ret = btrfs_remove_block_group_free_space(trans, block_group); if (ret) goto out; @@ -1244,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; spin_lock(&block_group->lock); + /* + * Hitting this WARN means we removed a block group with an unwritten + * region. It will cause "unable to find chunk map for logical" errors. + */ + if (WARN_ON(has_unwritten_metadata(block_group))) + btrfs_warn(fs_info, + "block group %llu is removed before metadata write out", + block_group->start); + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); /* @@ -1403,7 +1425,7 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false); } return ret; } @@ -1436,14 +1458,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { - ret = btrfs_clear_extent_bits(&prev_trans->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY, NULL); if (ret) goto out; } - ret = btrfs_clear_extent_bits(&trans->transaction->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY, NULL); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) @@ -1586,8 +1608,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used && - block_group->zone_unusable < block_group->length) { + if ((space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) || + has_unwritten_metadata(block_group)) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the @@ -1616,8 +1639,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) ret = btrfs_zone_finish(block_group); if (ret < 0) { btrfs_dec_block_group_ro(block_group); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + btrfs_link_bg_list(block_group, &retry_list); ret = 0; + } goto next; } @@ -1843,7 +1868,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { - u64 zone_unusable; u64 used; u64 reserved; int ret = 0; @@ -1910,16 +1934,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the block group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. We also - * cache it before unlocking the block group, to prevent races - * (reports from KCSAN and such tools) with tasks updating it. - */ - zone_unusable = bg->zone_unusable; - spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1963,14 +1977,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) reserved = bg->reserved; spin_unlock(&bg->lock); - btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", - bg->start, - div64_u64(used * 100, bg->length), - div64_u64(reserved * 100, bg->length), - div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - ret = btrfs_relocate_chunk(fs_info, bg->start); + ret = btrfs_relocate_chunk(fs_info, bg->start, false); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", @@ -2372,7 +2380,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); cache->space_info = btrfs_find_space_info(info, cache->flags); - set_free_space_tree_thresholds(cache); + btrfs_set_free_space_tree_thresholds(cache); if (need_clear) { /* @@ -2791,7 +2799,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) block_group->length); if (ret) btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, block_group); + btrfs_add_block_group_free_space(trans, block_group); /* * If we restriped during balance, we may have added a new raid @@ -2889,7 +2897,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); cache->length = size; - set_free_space_tree_thresholds(cache); + btrfs_set_free_space_tree_thresholds(cache); cache->flags = type; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); @@ -3636,9 +3644,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); ret = update_block_group_item(trans, path, cache); - } - if (ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } else if (ret) { btrfs_abort_transaction(trans, ret); + } } /* If its not on the io list, we need to put the block group */ @@ -4298,7 +4308,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); - btrfs_dump_space_info(fs_info, info, 0, 0); + btrfs_dump_space_info(fs_info, info, 0, false); } if (left < bytes) { @@ -4443,7 +4453,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) * indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, false); /* * If there was a failure to cleanup a log tree, very likely due to an @@ -4454,7 +4464,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, false); } WARN_ON(space_info->reclaim_size > 0); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9de356bcb411..a8bb8429c966 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -83,6 +83,8 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, /* Does the block group need to be added to the free space tree? */ BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + /* Set after we add a new block group to the free space tree. */ + BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, /* @@ -244,6 +246,11 @@ struct btrfs_block_group { /* Lock for free space tree operations. */ struct mutex free_space_lock; + /* Protected by @free_space_lock. */ + bool using_free_space_bitmaps; + /* Protected by @free_space_lock. */ + bool using_free_space_bitmaps_cached; + /* * Number of extents in this block group used for swap files. * All accesses protected by the spinlock 'lock'. diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a79fa0726f1d..b99fb0273292 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -525,6 +525,19 @@ static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) mapping_set_stable_writes(inode->vfs_inode.i_mapping); } +static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) +{ + /* Metadata inode should not reach here. */ + ASSERT(is_data_inode(inode)); + + /* We only allow BITS_PER_LONGS blocks for each bitmap. */ +#ifdef CONFIG_BTRFS_EXPERIMENTAL + mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0, + ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits) + >> PAGE_SHIFT))); +#endif +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 48d07939fee4..d09d622016ef 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -282,8 +282,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) { struct inode *inode = &cb->bbio.inode->vfs_inode; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - unsigned long index = cb->start >> PAGE_SHIFT; - unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; + pgoff_t index = cb->start >> PAGE_SHIFT; + const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; int i; int ret; @@ -415,7 +415,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, int *memstall, unsigned long *pflags) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - unsigned long end_index; + pgoff_t end_index; struct bio *orig_bio = &cb->orig_bbio->bio; u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); @@ -446,8 +446,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (cur < compressed_end) { - u64 page_end; - u64 pg_index = cur >> PAGE_SHIFT; + pgoff_t page_end; + pgoff_t pg_index = cur >> PAGE_SHIFT; u32 add_size; if (pg_index > end_index) @@ -789,8 +789,8 @@ static void btrfs_init_workspace_manager(int type) */ workspace = alloc_workspace(type, 0); if (IS_ERR(workspace)) { - pr_warn( - "BTRFS: cannot preallocate compression workspace, will try later\n"); + btrfs_warn(NULL, + "cannot preallocate compression workspace, will try later"); } else { atomic_set(&wsm->total_ws, 1); wsm->free_ws = 1; @@ -888,9 +888,9 @@ again: /* once per minute */ 60 * HZ, /* no burst */ 1); - if (__ratelimit(&_rs)) { - pr_warn("BTRFS: no compression workspaces, low memory, retrying\n"); - } + if (__ratelimit(&_rs)) + btrfs_warn(NULL, + "no compression workspaces, low memory, retrying"); } goto again; } @@ -975,7 +975,7 @@ static int btrfs_compress_set_level(unsigned int type, int level) if (level == 0) level = ops->default_level; else - level = min(max(level, ops->min_level), ops->max_level); + level = clamp(level, ops->min_level, ops->max_level); return level; } @@ -1482,7 +1482,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, struct heuristic_ws *ws) { struct page *page; - u64 index, index_end; + pgoff_t index, index_end; u32 i, curr_sample_pos; u8 *in_data; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d34c4341eaf4..1b38e707bbd9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -13,6 +13,7 @@ #include <linux/wait.h> #include <linux/pagemap.h> #include "bio.h" +#include "fs.h" #include "messages.h" struct address_space; @@ -77,12 +78,10 @@ struct compressed_bio { /* @range_end must be exclusive. */ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { - const u64 folio_end = folio_pos(folio) + folio_size(folio); - /* @cur must be inside the folio. */ ASSERT(folio_pos(folio) <= cur); - ASSERT(cur < folio_end); - return min(range_end, folio_end) - cur; + ASSERT(cur < folio_end(folio)); + return min(range_end, folio_end(folio)) - cur; } int __init btrfs_init_compress(void); @@ -114,6 +113,8 @@ enum btrfs_compression_type { BTRFS_COMPRESS_LZO = 2, BTRFS_COMPRESS_ZSTD = 3, BTRFS_NR_COMPRESS_TYPES = 4, + + BTRFS_DEFRAG_DONT_COMPRESS, }; struct workspace_manager { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a2e7979372cc..74e6d7f3d266 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ - if (atomic_inc_not_zero(&eb->refs)) { + if (refcount_inc_not_zero(&eb->refs)) { rcu_read_unlock(); break; } @@ -283,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + if (unlikely(btrfs_header_generation(buf) > trans->transid)) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } + + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_inc_ref(trans, root, cow, 1); - else + if (ret) + btrfs_abort_transaction(trans, ret); + } else { ret = btrfs_inc_ref(trans, root, cow, 0); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); - btrfs_abort_transaction(trans, ret); return ret; } @@ -303,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { const u64 buf_gen = btrfs_header_generation(buf); @@ -549,7 +560,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto error_unlock_cow; } - atomic_inc(&cow->refs); + refcount_inc(&cow->refs); rcu_assign_pointer(root->node, cow); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf, @@ -602,9 +613,9 @@ error_unlock_cow: return ret; } -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) +static inline int should_cow_block(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) return 0; @@ -724,7 +735,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -int btrfs_bin_search(struct extent_buffer *eb, int first_slot, +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot) { unsigned long p; @@ -1081,7 +1092,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - atomic_inc(&left->refs); + refcount_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -1268,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * to the block in 'slot', and triggering ra on them. */ static void reada_for_search(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, + const struct btrfs_path *path, int level, int slot, u64 objectid) { struct extent_buffer *node; @@ -1350,7 +1361,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, } } -static noinline void reada_for_balance(struct btrfs_path *path, int level) +static noinline void reada_for_balance(const struct btrfs_path *path, int level) { struct extent_buffer *parent; int slot; @@ -1446,8 +1457,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, u64 blocknr; struct extent_buffer *tmp = NULL; int ret = 0; + int ret2; int parent_level; - int err; bool read_tmp = false; bool tmp_locked = false; bool path_released = false; @@ -1505,9 +1516,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } /* Now we're allowed to do a blocking uptodate check. */ - err = btrfs_read_extent_buffer(tmp, &check); - if (err) { - ret = err; + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; goto out; } @@ -1548,9 +1559,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } /* Now we're allowed to do a blocking uptodate check. */ - err = btrfs_read_extent_buffer(tmp, &check); - if (err) { - ret = err; + ret2 = btrfs_read_extent_buffer(tmp, &check); + if (ret2) { + ret = ret2; goto out; } @@ -1685,7 +1696,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, if (p->search_commit_root) { b = root->commit_root; - atomic_inc(&b->refs); + refcount_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1794,7 +1805,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path) return 0; } -static inline int search_for_key_slot(struct extent_buffer *eb, +static inline int search_for_key_slot(const struct extent_buffer *eb, int search_low_slot, const struct btrfs_key *key, int prev_cmp, @@ -1928,15 +1939,14 @@ static int search_leaf(struct btrfs_trans_handle *trans, ASSERT(leaf_free_space >= 0); if (leaf_free_space < ins_len) { - int err; - - err = split_leaf(trans, root, key, path, ins_len, - (ret == 0)); - ASSERT(err <= 0); - if (WARN_ON(err > 0)) - err = -EUCLEAN; - if (err) - ret = err; + int ret2; + + ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0)); + ASSERT(ret2 <= 0); + if (WARN_ON(ret2 > 0)) + ret2 = -EUCLEAN; + if (ret2) + ret = ret2; } } @@ -1982,7 +1992,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; /* everything at write_lock_level or lower must be write locked */ @@ -2053,6 +2062,7 @@ again: while (b) { int dec = 0; + int ret2; level = btrfs_header_level(b); @@ -2081,16 +2091,15 @@ again: } if (last_level) - err = btrfs_cow_block(trans, root, b, NULL, 0, - &b, - BTRFS_NESTING_COW); + ret2 = btrfs_cow_block(trans, root, b, NULL, 0, + &b, BTRFS_NESTING_COW); else - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b, - BTRFS_NESTING_COW); - if (err) { - ret = err; + ret2 = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b, + BTRFS_NESTING_COW); + if (ret2) { + ret = ret2; goto done; } } @@ -2138,12 +2147,12 @@ cow_done: slot--; } p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, ins_len, - &write_lock_level); - if (err == -EAGAIN) + ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (ret2 == -EAGAIN) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } b = p->nodes[level]; @@ -2169,11 +2178,11 @@ cow_done: goto done; } - err = read_block_for_search(root, p, &b, slot, key); - if (err == -EAGAIN && !p->nowait) + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } @@ -2236,7 +2245,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct extent_buffer *b; int slot; int ret; - int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -2261,6 +2269,7 @@ again: while (b) { int dec = 0; + int ret2; level = btrfs_header_level(b); p->nodes[level] = b; @@ -2296,11 +2305,11 @@ again: goto done; } - err = read_block_for_search(root, p, &b, slot, key); - if (err == -EAGAIN && !p->nowait) + ret2 = read_block_for_search(root, p, &b, slot, key); + if (ret2 == -EAGAIN && !p->nowait) goto again; - if (err) { - ret = err; + if (ret2) { + ret = ret2; goto done; } @@ -2872,6 +2881,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (ret < 0) { int ret2; + btrfs_clear_buffer_dirty(trans, c); ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); if (ret2 < 0) btrfs_abort_transaction(trans, ret2); @@ -2885,7 +2895,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, free_extent_buffer(old); add_root_to_dirty_list(root); - atomic_inc(&c->refs); + refcount_inc(&c->refs); path->nodes[level] = c; path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; @@ -3100,7 +3110,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = right->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; - struct btrfs_map_token token; struct btrfs_disk_key disk_key; int slot; u32 i; @@ -3174,13 +3183,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ - btrfs_init_map_token(&token, right); right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - push_space -= btrfs_token_item_size(&token, i); - btrfs_set_token_item_offset(&token, i, push_space); + push_space -= btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } left_nritems -= push_items; @@ -3323,7 +3331,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, int ret = 0; u32 this_item_size; u32 old_left_item_size; - struct btrfs_map_token token; if (empty) nr = min(right_nritems, max_slot); @@ -3371,13 +3378,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); - btrfs_init_map_token(&token, left); old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1); for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, + ioff = btrfs_item_offset(left, i); + btrfs_set_item_offset(left, i, ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -3398,13 +3404,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_header_nritems(right) - push_items); } - btrfs_init_map_token(&token, right); right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { - push_space = push_space - btrfs_token_item_size(&token, i); - btrfs_set_token_item_offset(&token, i, push_space); + push_space = push_space - btrfs_item_size(right, i); + btrfs_set_item_offset(right, i, push_space); } btrfs_mark_buffer_dirty(trans, left); @@ -3518,7 +3523,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, int i; int ret; struct btrfs_disk_key disk_key; - struct btrfs_map_token token; nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); @@ -3531,12 +3535,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); - btrfs_init_map_token(&token, right); for (i = 0; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + rt_data_off); + ioff = btrfs_item_offset(right, i); + btrfs_set_item_offset(right, i, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -4002,7 +4005,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int old_size; unsigned int size_diff; int i; - struct btrfs_map_token token; leaf = path->nodes[0]; slot = path->slots[0]; @@ -4025,12 +4027,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + size_diff); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + size_diff); } /* shift the data */ @@ -4093,7 +4094,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_data; unsigned int old_size; int i; - struct btrfs_map_token token; leaf = path->nodes[0]; @@ -4119,12 +4119,11 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - btrfs_init_map_token(&token, leaf); for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff - data_size); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - data_size); } /* shift the data */ @@ -4164,7 +4163,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *leaf; int slot; - struct btrfs_map_token token; u32 total_size; /* @@ -4192,7 +4190,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, BUG(); } - btrfs_init_map_token(&token, leaf); if (slot != nritems) { unsigned int old_data = btrfs_item_data_end(leaf, slot); @@ -4210,8 +4207,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, for (i = slot; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff - batch->total_data_size); } /* shift the items */ @@ -4228,8 +4225,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans, btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); data_end -= batch->data_sizes[i]; - btrfs_set_token_item_offset(&token, slot + i, data_end); - btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]); + btrfs_set_item_offset(leaf, slot + i, data_end); + btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]); } btrfs_set_header_nritems(leaf, nritems + batch->nr); @@ -4442,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used_bytes(root); - atomic_inc(&leaf->refs); + refcount_inc(&leaf->refs); ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); if (ret < 0) @@ -4469,7 +4466,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (slot + nr != nritems) { const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); const int data_end = leaf_data_end(leaf); - struct btrfs_map_token token; u32 dsize = 0; int i; @@ -4479,12 +4475,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, memmove_leaf_data(leaf, data_end + dsize, data_end, last_off - data_end); - btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { u32 ioff; - ioff = btrfs_token_item_offset(&token, i); - btrfs_set_token_item_offset(&token, i, ioff + dsize); + ioff = btrfs_item_offset(leaf, i); + btrfs_set_item_offset(leaf, i, ioff + dsize); } memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); @@ -4527,7 +4522,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; - atomic_inc(&leaf->refs); + refcount_inc(&leaf->refs); /* * We want to be able to at least push one item to the * left neighbour leaf, and that's the first item. @@ -4585,16 +4580,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are have a minimum transaction id. + * for leaves that have a minimum transaction id. * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the * key and get a writable path. * - * This honors path->lowest_level to prevent descent past a given level - * of the tree. - * * min_trans indicates the oldest transaction that you are interested * in walking through. Any nodes or leaves older than min_trans are * skipped over (without reading them). @@ -4615,6 +4607,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, int keep_locks = path->keep_locks; ASSERT(!path->nowait); + ASSERT(path->lowest_level == 0); path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); @@ -4636,8 +4629,8 @@ again: goto out; } - /* at the lowest level, we're done, setup the path and exit */ - if (level == path->lowest_level) { + /* At level 0 we're done, setup the path and exit. */ + if (level == 0) { if (slot >= nritems) goto find_next_key; ret = 0; @@ -4678,12 +4671,6 @@ find_next_key: goto out; } } - if (level == path->lowest_level) { - ret = 0; - /* Save our key for returning back. */ - btrfs_node_key_to_cpu(cur, min_key, slot); - goto out; - } cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); @@ -4699,7 +4686,7 @@ find_next_key: out: path->keep_locks = keep_locks; if (ret == 0) - btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_unlock_up_safe(path, 1); return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 71fa42ca04fe..fe70b593c7cd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -224,16 +224,10 @@ struct btrfs_root { struct list_head root_list; - /* - * Xarray that keeps track of in-memory inodes, protected by the lock - * @inode_lock. - */ + /* Xarray that keeps track of in-memory inodes. */ struct xarray inodes; - /* - * Xarray that keeps track of delayed nodes of every inode, protected - * by @inode_lock. - */ + /* Xarray that keeps track of delayed nodes of every inode. */ struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid @@ -508,7 +502,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -int btrfs_bin_search(struct extent_buffer *eb, int first_slot, +int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); @@ -576,9 +570,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf); +bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans, + const struct btrfs_root *root, + const struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, @@ -727,13 +721,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(const struct extent_buffer *leaf); -static inline int is_fstree(u64 rootid) +static inline bool btrfs_is_fstree(u64 rootid) { - if (rootid == BTRFS_FS_TREE_OBJECTID || - ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && - !btrfs_qgroup_level(rootid))) - return 1; - return 0; + if (rootid == BTRFS_FS_TREE_OBJECTID) + return true; + + if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID) + return false; + + if (btrfs_qgroup_level(rootid) != 0) + return false; + + return true; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 1831618579cb..738179a5e170 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -60,6 +60,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1, return 0; } +static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node); + const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node); + + return compare_inode_defrag(new_defrag, existing_defrag); +} + /* * Insert a record for an inode into the defrag tree. The lock must be held * already. @@ -71,37 +79,23 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; + struct rb_node *node; - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); + node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp); + if (node) { + struct inode_defrag *entry; - ret = compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* - * If we're reinserting an entry for an old defrag run, - * make sure to lower the transid of our existing - * record. - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } + entry = rb_entry(node, struct inode_defrag, rb_node); + /* + * If we're reinserting an entry for an old defrag run, make + * sure to lower the transid of our existing record. + */ + if (defrag->transid < entry->transid) + entry->transid = defrag->transid; + entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh); + return -EEXIST; } set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; } @@ -854,8 +848,8 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 folio_start; - u64 folio_end; + u64 lock_start; + u64 lock_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; @@ -891,15 +885,15 @@ again: return ERR_PTR(ret); } - folio_start = folio_pos(folio); - folio_end = folio_pos(folio) + folio_size(folio) - 1; + lock_start = folio_pos(folio); + lock_end = folio_end(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; - btrfs_lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio)); - btrfs_unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio)); + btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); if (!ordered) break; @@ -953,7 +947,7 @@ struct defrag_target_range { * @extent_thresh: file extent size threshold, any extent size >= this value * will be ignored * @newer_than: only defrag extents newer than this value - * @do_compress: whether the defrag is doing compression + * @do_compress: whether the defrag is doing compression or no-compression * if true, @extent_thresh will be ignored and all regular * file extents meeting @newer_than will be targets. * @locked: if the range has already held extent lock @@ -1184,8 +1178,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, if (!folio) break; - if (start >= folio_pos(folio) + folio_size(folio) || - start + len <= folio_pos(folio)) + if (start >= folio_end(folio) || start + len <= folio_pos(folio)) continue; btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); @@ -1226,7 +1219,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, folios[i] = NULL; goto free_folios; } - cur = folio_pos(folios[i]) + folio_size(folios[i]); + cur = folio_end(folios[i]); } for (int i = 0; i < nr_pages; i++) { if (!folios[i]) @@ -1371,6 +1364,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); + bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; int compress_level = 0; int ret = 0; @@ -1401,6 +1395,9 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, if (range->compress_type) compress_type = range->compress_type; } + } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) { + compress_type = BTRFS_DEFRAG_DONT_COMPRESS; + compress_level = 1; } if (extent_thresh == 0) @@ -1451,13 +1448,14 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, btrfs_inode_unlock(inode, 0); break; } - if (do_compress) { + if (do_compress || no_compress) { inode->defrag_compress = compress_type; inode->defrag_compress_level = compress_level; } ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, §ors_defragged, + newer_than, do_compress || no_compress, + §ors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) @@ -1496,7 +1494,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); ret = sectors_defragged; } - if (do_compress) { + if (do_compress || no_compress) { btrfs_inode_lock(inode, 0); inode->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7cc24a5dd5e..0f8d8e275143 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -334,6 +334,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, return item; } +static int delayed_item_index_cmp(const void *key, const struct rb_node *node) +{ + const u64 *index = key; + const struct btrfs_delayed_item *delayed_item = rb_entry(node, + struct btrfs_delayed_item, rb_node); + + if (delayed_item->index < *index) + return 1; + else if (delayed_item->index > *index) + return -1; + + return 0; +} + /* * Look up the delayed item by key. * @@ -347,21 +361,10 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( struct rb_root *root, u64 index) { - struct rb_node *node = root->rb_node; - struct btrfs_delayed_item *delayed_item = NULL; + struct rb_node *node; - while (node) { - delayed_item = rb_entry(node, struct btrfs_delayed_item, - rb_node); - if (delayed_item->index < index) - node = node->rb_right; - else if (delayed_item->index > index) - node = node->rb_left; - else - return delayed_item; - } - - return NULL; + node = rb_find(&index, root, delayed_item_index_cmp); + return rb_entry_safe(node, struct btrfs_delayed_item, rb_node); } static int btrfs_delayed_item_cmp(const struct rb_node *new, @@ -369,14 +372,8 @@ static int btrfs_delayed_item_cmp(const struct rb_node *new, { const struct btrfs_delayed_item *new_item = rb_entry(new, struct btrfs_delayed_item, rb_node); - const struct btrfs_delayed_item *exist_item = - rb_entry(exist, struct btrfs_delayed_item, rb_node); - if (new_item->index < exist_item->index) - return -1; - if (new_item->index > exist_item->index) - return 1; - return 0; + return delayed_item_index_cmp(&new_item->index, exist); } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, @@ -1008,8 +1005,16 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, ret = btrfs_lookup_inode(trans, root, path, &key, mod); if (ret > 0) ret = -ENOENT; - if (ret < 0) + if (ret < 0) { + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the + * improper counts behind. + */ + if (ret != -ENOENT) + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], @@ -1034,8 +1039,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) + if (ret < 0) { + btrfs_abort_transaction(trans, ret); goto err_out; + } ASSERT(ret > 0); ASSERT(path->slots[0] > 0); ret = 0; @@ -1057,21 +1064,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, * in the same item doesn't exist. */ ret = btrfs_del_item(trans, root, path); + if (ret < 0) + btrfs_abort_transaction(trans, ret); out: btrfs_release_delayed_iref(node); btrfs_release_path(path); err_out: btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); - - /* - * If we fail to update the delayed inode we need to abort the - * transaction, because we could leave the inode with the improper - * counts behind. - */ - if (ret && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); - return ret; } @@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); + struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + + if (WARN_ON(node)) + refcount_dec(&node->refs); } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) @@ -1537,8 +1540,8 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, - u64 index) +static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, + u64 index) { struct btrfs_delayed_item *item; @@ -1546,7 +1549,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); if (!item) { mutex_unlock(&node->mutex); - return 1; + return false; } /* @@ -1581,7 +1584,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node, } mutex_unlock(&node->mutex); - return 0; + return true; } int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, @@ -1595,9 +1598,10 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (IS_ERR(node)) return PTR_ERR(node); - ret = btrfs_delete_delayed_insertion_item(node, index); - if (!ret) + if (btrfs_delete_delayed_insertion_item(node, index)) { + ret = 0; goto end; + } item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); if (!item) { @@ -1614,7 +1618,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, */ if (ret < 0) { btrfs_err(trans->fs_info, -"metadata reservation failed for delayed dir item deltiona, should have been reserved"); +"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d", + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_release_delayed_item(item); goto end; } @@ -1623,9 +1628,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, - "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, btrfs_root_id(node->root), - node->inode_id, ret); +"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d", + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); } @@ -1730,17 +1734,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, downgrade_write(&inode->vfs_inode.i_rwsem); } -int btrfs_should_delete_dir_index(const struct list_head *del_list, - u64 index) +bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index) { struct btrfs_delayed_item *curr; - int ret = 0; + bool ret = false; list_for_each_entry(curr, del_list, readdir_list) { if (curr->index > index) break; if (curr->index == index) { - ret = 1; + ret = true; break; } } @@ -1750,15 +1753,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list, /* * Read dir info stored in the delayed tree. */ -int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - const struct list_head *ins_list) +bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + const struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; struct btrfs_key location; char *name; int name_len; - int over = 0; unsigned char d_type; /* @@ -1767,6 +1769,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, * directory, nobody can delete any directory indexes now. */ list_for_each_entry_safe(curr, next, ins_list, readdir_list) { + bool over; + list_del(&curr->readdir_list); if (curr->index < ctx->pos) { @@ -1784,17 +1788,16 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); - over = !dir_emit(ctx, name, name_len, - location.objectid, d_type); + over = !dir_emit(ctx, name, name_len, location.objectid, d_type); if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) - return 1; + return true; ctx->pos++; } - return 0; + return false; } static void fill_stack_inode_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index c4b4ba122beb..e6e763ad2d42 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -150,10 +150,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode, void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode, struct list_head *ins_list, struct list_head *del_list); -int btrfs_should_delete_dir_index(const struct list_head *del_list, - u64 index); -int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - const struct list_head *ins_list); +bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index); +bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx, + const struct list_head *ins_list); /* Used during directory logging. */ void btrfs_log_get_delayed_items(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 739c9e29aaa3..ca382c5b186f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -928,7 +928,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(generic_ref->ref_root)) + if (btrfs_is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); @@ -958,8 +958,8 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, #endif generic_ref->tree_ref.level = level; generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && - (!mod_root || is_fstree(mod_root)))) + if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && + (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; @@ -976,8 +976,8 @@ void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, generic_ref->data_ref.objectid = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && - (!mod_root || is_fstree(mod_root)))) + if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && + (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 78cc23837610..552ec4fa645d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -420,7 +420,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); -static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) @@ -428,7 +428,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) return node->tree_ref.level; } -static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) @@ -436,7 +436,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) return 0; } -static inline u8 btrfs_ref_type(struct btrfs_ref *ref) +static inline u8 btrfs_ref_type(const struct btrfs_ref *ref) { ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2decb9fff445..4675bcd5f92e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -250,7 +250,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); + fs_info->sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev_file); @@ -327,7 +327,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - fput(bdev_file); + bdev_fput(bdev_file); return ret; } @@ -600,7 +600,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(src_device); if (btrfs_pinned_by_swapfile(fs_info, src_device)) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "cannot replace device %s (devid %llu) due to active swapfile", btrfs_dev_name(src_device), src_device->devid); return -ETXTBSY; @@ -647,7 +647,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, @@ -943,7 +943,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device); } else { if (scrub_ret != -ECANCELED) - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, @@ -961,7 +961,7 @@ error: return scrub_ret; } - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, @@ -1109,7 +1109,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) * btrfs_dev_replace_finishing() will handle the * cleanup part */ - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "dev_replace from %s (devid %llu) to %s canceled", btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); @@ -1143,7 +1143,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) ret = btrfs_commit_transaction(trans); WARN_ON(ret); - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "suspended dev_replace from %s (devid %llu) to %s canceled", btrfs_dev_name(src_device), src_device->devid, btrfs_dev_name(tgt_device)); @@ -1247,7 +1247,7 @@ static int btrfs_dev_replace_kthread(void *data) progress = btrfs_dev_replace_progress(fs_info); progress = div_u64(progress, 10); - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "continuing dev_replace from %s (devid %llu) to target %s @%u%%", btrfs_dev_name(dev_replace->srcdev), dev_replace->srcdev->devid, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index b29cc31a7c4a..69863e398e22 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return di; } -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name) { int ret; @@ -242,7 +242,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (!path) return -ENOMEM; - key.objectid = dir; + key.objectid = dir_ino; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 8462579a95f4..e52174a8baf9 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -14,7 +14,7 @@ struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1beb9458f622..70fc4e7cc5a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -884,7 +884,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - if (is_fstree(objectid)) + if (btrfs_is_fstree(objectid)) generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); @@ -1104,7 +1104,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(btrfs_root_id(root))) { + btrfs_is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1113,7 +1113,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(btrfs_root_id(root)) && + if (btrfs_is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); @@ -1246,6 +1246,8 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + if (fs_info->fs_devices) + btrfs_close_devices(fs_info->fs_devices); percpu_counter_destroy(&fs_info->stats_read_blocks); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); @@ -1315,7 +1317,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, * This is namely for free-space-tree and quota tree, which can change * at runtime and should only be grabbed from fs_info. */ - if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); @@ -1835,6 +1837,8 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); + if (WARN_ON(!xa_empty(&root->delayed_nodes))) + xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); @@ -1945,7 +1949,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; - fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT; mutex_init(&fs_info->qgroup_rescan_lock); @@ -2026,14 +2029,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; - /* - * Check if the checksum implementation is a fast accelerated one. - * As-is this is a bit of a hack and should be replaced once the csum - * implementations provide that information themselves. - */ + /* Check if the checksum implementation is a fast accelerated one. */ switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: - if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + if (crc32_optimizations() & CRC32C_OPTIMIZATION) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; case BTRFS_CSUM_TYPE_XXHASH: @@ -2156,8 +2155,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -3395,6 +3393,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); fs_info->nodesize = nodesize; + fs_info->nodesize_bits = ilog2(nodesize); fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; @@ -3560,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } + btrfs_zoned_reserve_data_reloc_bg(fs_info); btrfs_free_zone_cache(fs_info); btrfs_check_active_zone_reservation(fs_info); @@ -3680,7 +3680,6 @@ fail_alloc: iput(fs_info->btree_inode); fail: - btrfs_close_devices(fs_info->fs_devices); ASSERT(ret < 0); return ret; } @@ -3693,7 +3692,7 @@ static void btrfs_end_super_write(struct bio *bio) bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { - btrfs_warn_rl_in_rcu(device->fs_info, + btrfs_warn_rl(device->fs_info, "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); @@ -3991,7 +3990,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) } if (min_tolerated == INT_MAX) { - pr_warn("BTRFS: unknown raid flag: %llu", flags); + btrfs_warn(NULL, "unknown raid flag: %llu", flags); min_tolerated = 0; } @@ -4310,8 +4309,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one - * can either create new ordered extents nor create delayed iputs - * through some other means. + * can create new ordered extents, but delayed iputs can still be added + * by a reclaim worker (see comments further below). * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, @@ -4322,15 +4321,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); + /* + * Run delayed iputs in case an async reclaim worker is waiting for them + * to be run as mentioned above. + */ btrfs_run_delayed_iputs(fs_info); - /* There should be no more workload to generate new delayed iputs. */ - set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); + /* + * Run delayed iputs again because an async reclaim worker may have + * added new ones if it was flushing delalloc: + * + * shrink_delalloc() -> btrfs_start_delalloc_roots() -> + * start_delalloc_inodes() -> btrfs_add_delayed_iput() + */ + btrfs_run_delayed_iputs(fs_info); + + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4413,7 +4426,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); btrfs_mapping_tree_free(fs_info); - btrfs_close_devices(fs_info->fs_devices); } void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, @@ -4625,7 +4637,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, mark, NULL)) { - btrfs_clear_extent_bits(dirty_pages, start, end, mark); + btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index b1b96eb5f64e..66361325f6dc 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -43,7 +43,8 @@ static inline void btrfs_extent_state_leak_debug_check(void) while (!list_empty(&states)) { state = list_first_entry(&states, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", + btrfs_err(NULL, + "state leak: start %llu end %llu state %u in tree %d refs %d", state->start, state->end, state->state, extent_state_in_tree(state), refcount_read(&state->refs)); @@ -1882,12 +1883,11 @@ int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 e bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached) { - int err; + int ret; u64 failed_start; - err = set_extent_bit(tree, start, end, bits, &failed_start, NULL, - cached, NULL); - if (err == -EEXIST) { + ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); + if (ret == -EEXIST) { if (failed_start > start) btrfs_clear_extent_bit(tree, start, failed_start - 1, bits, cached); @@ -1904,21 +1904,21 @@ int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 struct extent_state **cached_state) { struct extent_state *failed_state = NULL; - int err; + int ret; u64 failed_start; - err = set_extent_bit(tree, start, end, bits, &failed_start, + ret = set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); - while (err == -EEXIST) { + while (ret == -EEXIST) { if (failed_start != start) btrfs_clear_extent_bit(tree, start, failed_start - 1, bits, cached_state); wait_extent_bit(tree, failed_start, end, bits, &failed_state); - err = set_extent_bit(tree, start, end, bits, &failed_start, + ret = set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } - return err; + return ret; } /* diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 0a18ca9c59c3..36facca37973 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -19,7 +19,8 @@ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_LOCKED), ENUM_BIT(EXTENT_DIO_LOCKED), - ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DIRTY_LOG1), + ENUM_BIT(EXTENT_DIRTY_LOG2), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), ENUM_BIT(EXTENT_BOUNDARY), @@ -191,12 +192,6 @@ static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u6 cached, NULL); } -static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return btrfs_clear_extent_bit(tree, start, end, bits, NULL); -} - int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb6128778a83..97d517cdf2df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -46,7 +46,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -56,12 +56,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key); -static int block_group_bits(struct btrfs_block_group *cache, u64 bits) +static int block_group_bits(const struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } @@ -329,7 +329,7 @@ search_again: * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { struct btrfs_fs_info *fs_info = eb->fs_info; @@ -401,16 +401,16 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) return ((u64)high_crc << 31) ^ (u64)low_crc; } -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) +static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref) { return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), btrfs_extent_data_ref_objectid(leaf, ref), btrfs_extent_data_ref_offset(leaf, ref)); } -static bool match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, +static bool match_extent_data_ref(const struct extent_buffer *leaf, + const struct btrfs_extent_data_ref *ref, u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || @@ -497,7 +497,7 @@ fail: static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); @@ -617,13 +617,13 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) +static noinline u32 extent_data_ref_count(const struct btrfs_path *path, + const struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; + const struct btrfs_extent_data_ref *ref1; + const struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; int type; @@ -638,10 +638,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + ref2 = (const struct btrfs_shared_data_ref *)(iref + 1); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { @@ -684,7 +684,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); @@ -722,7 +722,7 @@ static inline int extent_ref_type(u64 parent, u64 owner) return type; } -static int find_next_key(struct btrfs_path *path, int level, +static int find_next_key(const struct btrfs_path *path, int level, struct btrfs_key *key) { @@ -1480,7 +1480,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { BTRFS_PATH_AUTO_FREE(path); @@ -1522,19 +1522,21 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = insert_tree_block_ref(trans, path, node, bytenr); - else + if (ret) + btrfs_abort_transaction(trans, ret); + } else { ret = insert_extent_data_ref(trans, path, node, bytenr); - - if (ret) - btrfs_abort_transaction(trans, ret); + if (ret) + btrfs_abort_transaction(trans, ret); + } return ret; } static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_head *href) + const struct btrfs_delayed_ref_head *href) { u64 root = href->owning_root; @@ -1543,7 +1545,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, * where it has already been unset. */ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || - !href->is_data || !is_fstree(root)) + !href->is_data || !btrfs_is_fstree(root)) return; btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, @@ -1552,7 +1554,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1620,7 +1622,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head, + const struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1707,7 +1709,7 @@ again: static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -1754,7 +1756,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { @@ -2998,7 +3000,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } - ret = add_to_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3079,7 +3081,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -3649,6 +3651,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache, btrfs_put_block_group(cache); } +static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl, + const struct btrfs_block_group *bg) +{ + if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) + return true; + if (!btrfs_block_group_should_use_size_class(bg)) + return true; + if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) + return true; + if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && + bg->size_class == BTRFS_BG_SZ_NONE) + return true; + return ffe_ctl->size_class == bg->size_class; +} + /* * Helper function for find_free_extent(). * @@ -3670,7 +3687,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || - !block_group_bits(cluster_bg, ffe_ctl->flags))) + !block_group_bits(cluster_bg, ffe_ctl->flags) || + !find_free_extent_check_size_class(ffe_ctl, cluster_bg))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, @@ -4227,21 +4245,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return -ENOSPC; } -static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group *bg) -{ - if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) - return true; - if (!btrfs_block_group_should_use_size_class(bg)) - return true; - if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) - return true; - if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && - bg->size_class == BTRFS_BG_SZ_NONE) - return true; - return ffe_ctl->size_class == bg->size_class; -} - static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, @@ -4782,7 +4785,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes); if (ret) return ret; @@ -4873,7 +4876,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, + const struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -4961,7 +4964,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); - if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root)) generic_ref.owning_root = root->relocation_src_root; btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); @@ -4983,7 +4986,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; - struct btrfs_squota_delta delta = { + const struct btrfs_squota_delta delta = { .root = root_objectid, .num_bytes = ins->offset, .generation = trans->transid, @@ -5111,11 +5114,11 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (buf->log_index == 0) btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY_LOG1, NULL); else btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, - EXTENT_NEW, NULL); + EXTENT_DIRTY_LOG2, NULL); } else { buf->log_index = -1; btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, @@ -5552,7 +5555,7 @@ again: goto again; } - exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); + exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent); mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); @@ -5872,15 +5875,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ret = btrfs_dec_ref(trans, root, eb, 1); - else + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } else { ret = btrfs_dec_ref(trans, root, eb, 0); - if (ret) { - btrfs_abort_transaction(trans, ret); - return ret; + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } } - if (is_fstree(btrfs_root_id(root))) { + if (btrfs_is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -6341,7 +6349,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); - atomic_inc(&parent->refs); + refcount_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); @@ -6442,7 +6450,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { DEBUG_WARN(); - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, btrfs_dev_name(device), diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 72914074c304..82d3a82dc712 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -97,7 +97,7 @@ enum btrfs_inline_ref_type { }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, + const struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 849199768664..835b0deef9bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, struct extent_buffer, leak_list); - pr_err( - "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, + btrfs_err(fs_info, + "buffer leak start %llu len %u refs %d bflags %lu owner %llu", + eb->start, eb->len, refcount_read(&eb->refs), eb->bflags, btrfs_header_owner(eb)); list_del(&eb->leak_list); WARN_ON_ONCE(1); @@ -110,6 +110,7 @@ struct btrfs_bio_ctrl { * This is to avoid touching ranges covered by compression/inline. */ unsigned long submit_bitmap; + struct readahead_control *ractl; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -266,8 +267,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, goto out; } range_start = max_t(u64, folio_pos(folio), start); - range_len = min_t(u64, folio_pos(folio) + folio_size(folio), - end + 1) - range_start; + range_len = min_t(u64, folio_end(folio), end + 1) - range_start; btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; @@ -321,7 +321,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, ASSERT(orig_end > orig_start); /* The range should at least cover part of the folio */ - ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + ASSERT(!(orig_start >= folio_end(locked_folio) || orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ @@ -419,7 +419,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + folio_size(folio)); + start + len <= folio_end(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); @@ -782,7 +782,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, static int attach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio, - struct btrfs_subpage *prealloc) + struct btrfs_folio_state *prealloc) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -806,7 +806,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, /* Already mapped, just free prealloc */ if (folio_test_private(folio)) { - btrfs_free_subpage(prealloc); + btrfs_free_folio_state(prealloc); return 0; } @@ -815,7 +815,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } @@ -831,7 +831,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); if (btrfs_is_subpage(fs_info, folio)) - return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); + return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; @@ -848,7 +848,7 @@ void clear_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); if (btrfs_is_subpage(fs_info, folio)) - return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); + return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -882,6 +882,25 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, return em; } + +static void btrfs_readahead_expand(struct readahead_control *ractl, + const struct extent_map *em) +{ + const u64 ra_pos = readahead_pos(ractl); + const u64 ra_end = ra_pos + readahead_length(ractl); + const u64 em_end = em->start + em->ram_bytes; + + /* No expansion for holes and inline extents. */ + if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) + return; + + ASSERT(em_end >= ra_pos, + "extent_map %llu %llu ends before current readahead position %llu", + em->start, em->len, ra_pos); + if (em_end > ra_end) + readahead_expand(ractl, ra_pos, em_end - ra_pos); +} + /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io @@ -945,6 +964,16 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, compress_type = btrfs_extent_map_compression(em); + /* + * Only expand readahead for extents which are already creating + * the pages anyway in add_ra_bio_pages, which is compressed + * extents in the non subpage case. + */ + if (bio_ctrl->ractl && + !btrfs_is_subpage(fs_info, folio) && + compress_type != BTRFS_COMPRESS_NONE) + btrfs_readahead_expand(bio_ctrl->ractl, em); + if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else @@ -1086,7 +1115,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * finished our folio read and unlocked the folio. */ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_pos(folio) + folio_size(folio), + u64 range_len = min(folio_end(folio), ordered->file_offset + ordered->num_bytes) - cur; ret = true; @@ -1108,7 +1137,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * So we return true and update @next_ret to the OE/folio boundary. */ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_pos(folio) + folio_size(folio), + u64 range_len = min(folio_end(folio), ordered->file_offset + ordered->num_bytes) - cur; /* @@ -1663,7 +1692,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl int ret; size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); - unsigned long end_index = i_size >> PAGE_SHIFT; + const pgoff_t end_index = i_size >> PAGE_SHIFT; const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1704,7 +1733,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio)); ret = -EUCLEAN; goto done; @@ -1774,7 +1803,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); @@ -1874,7 +1903,7 @@ static void set_btree_ioerr(struct extent_buffer *eb) static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) { struct btrfs_fs_info *fs_info = eb->fs_info; - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; xas_lock_irqsave(&xas, flags); @@ -1886,7 +1915,7 @@ static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) { struct btrfs_fs_info *fs_info = eb->fs_info; - XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; xas_lock_irqsave(&xas, flags); @@ -1961,7 +1990,7 @@ retry: if (!eb) return NULL; - if (!atomic_inc_not_zero(&eb->refs)) { + if (!refcount_inc_not_zero(&eb->refs)) { xas_reset(xas); goto retry; } @@ -1986,7 +2015,7 @@ static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, rcu_read_lock(); while ((eb = find_get_eb(&xas, end, tag)) != NULL) { if (!eb_batch_add(batch, eb)) { - *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits); + *start = ((eb->start + eb->len) >> fs_info->nodesize_bits); goto out; } } @@ -2008,11 +2037,11 @@ static struct extent_buffer *find_extent_buffer_nolock( struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; - unsigned long index = (start >> fs_info->sectorsize_bits); + unsigned long index = (start >> fs_info->nodesize_bits); rcu_read_lock(); eb = xa_load(&fs_info->buffer_tree, index); - if (eb && !atomic_inc_not_zero(&eb->refs)) + if (eb && !refcount_inc_not_zero(&eb->refs)) eb = NULL; rcu_read_unlock(); return eb; @@ -2031,10 +2060,7 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) } buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); - + clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); bio_put(&bbio->bio); } @@ -2085,7 +2111,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + u32 range_len = min_t(u64, folio_end(folio), eb->start + eb->len) - range_start; folio_lock(folio); @@ -2114,8 +2140,8 @@ void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { struct eb_batch batch; - unsigned long start_index = (start >> fs_info->sectorsize_bits); - unsigned long end_index = (end >> fs_info->sectorsize_bits); + unsigned long start_index = (start >> fs_info->nodesize_bits); + unsigned long end_index = (end >> fs_info->nodesize_bits); eb_batch_init(&batch); while (start_index <= end_index) { @@ -2151,7 +2177,7 @@ int btree_write_cache_pages(struct address_space *mapping, eb_batch_init(&batch); if (wbc->range_cyclic) { - index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits); + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits); end = -1; /* @@ -2160,8 +2186,8 @@ int btree_write_cache_pages(struct address_space *mapping, */ scanned = (index == 0); } else { - index = (wbc->range_start >> fs_info->sectorsize_bits); - end = (wbc->range_end >> fs_info->sectorsize_bits); + index = (wbc->range_start >> fs_info->nodesize_bits); + end = (wbc->range_end >> fs_info->nodesize_bits); scanned = 1; } @@ -2489,7 +2515,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f continue; } - cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_end = min_t(u64, folio_end(folio) - 1, end); cur_len = cur_end + 1 - cur; ASSERT(folio_test_locked(folio)); @@ -2541,7 +2567,10 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ | REQ_RAHEAD, + .ractl = rac + }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); const u64 start = readahead_pos(rac); @@ -2731,13 +2760,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) static bool folio_range_has_eb(struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; lockdep_assert_held(&folio->mapping->i_private_lock); if (folio_test_private(folio)) { - subpage = folio_get_private(folio); - if (atomic_read(&subpage->eb_refs)) + bfs = folio_get_private(folio); + if (atomic_read(&bfs->eb_refs)) return true; } return false; @@ -2787,7 +2816,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2798,7 +2827,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. */ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); + btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA); spin_unlock(&mapping->i_private_lock); } @@ -2842,7 +2871,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info btrfs_leak_debug_add_eb(eb); spin_lock_init(&eb->refs_lock); - atomic_set(&eb->refs, 1); + refcount_set(&eb->refs, 1); ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); @@ -2975,13 +3004,13 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * once io is initiated, TREE_REF can no longer be cleared, so that is * the moment at which any such race is best fixed. */ - refs = atomic_read(&eb->refs); + refs = refcount_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) return; spin_lock(&eb->refs_lock); if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); spin_unlock(&eb->refs_lock); } @@ -3038,7 +3067,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb->fs_info = fs_info; again: xa_lock_irq(&fs_info->buffer_tree); - exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits, NULL, eb, GFP_NOFS); if (xa_is_err(exists)) { ret = xa_err(exists); @@ -3047,7 +3076,7 @@ again: return ERR_PTR(ret); } if (exists) { - if (!atomic_inc_not_zero(&exists->refs)) { + if (!refcount_inc_not_zero(&exists->refs)) { /* The extent buffer is being freed, retry. */ xa_unlock_irq(&fs_info->buffer_tree); goto again; @@ -3092,7 +3121,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, * just overwrite folio private. */ exists = folio_get_private(folio); - if (atomic_inc_not_zero(&exists->refs)) + if (refcount_inc_not_zero(&exists->refs)) return exists; WARN_ON(folio_test_dirty(folio)); @@ -3141,13 +3170,13 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, - struct btrfs_subpage *prealloc, + struct btrfs_folio_state *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; - const unsigned long index = eb->start >> PAGE_SHIFT; + const pgoff_t index = eb->start >> PAGE_SHIFT; struct folio *existing_folio; int ret; @@ -3224,7 +3253,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct btrfs_subpage *prealloc = NULL; + struct btrfs_folio_state *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; int uptodate = 1; @@ -3269,7 +3298,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * manually if we exit earlier. */ if (btrfs_meta_is_subpage(fs_info)) { - prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); + prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3280,7 +3309,7 @@ reallocate: /* Allocate all pages first. */ ret = alloc_eb_folio_array(eb, true); if (ret < 0) { - btrfs_free_subpage(prealloc); + btrfs_free_folio_state(prealloc); goto out; } @@ -3354,7 +3383,7 @@ reallocate: again: xa_lock_irq(&fs_info->buffer_tree); existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, - start >> fs_info->sectorsize_bits, NULL, eb, + start >> fs_info->nodesize_bits, NULL, eb, GFP_NOFS); if (xa_is_err(existing_eb)) { ret = xa_err(existing_eb); @@ -3362,7 +3391,7 @@ again: goto out; } if (existing_eb) { - if (!atomic_inc_not_zero(&existing_eb->refs)) { + if (!refcount_inc_not_zero(&existing_eb->refs)) { xa_unlock_irq(&fs_info->buffer_tree); goto again; } @@ -3391,7 +3420,7 @@ again: return eb; out: - WARN_ON(!atomic_dec_and_test(&eb->refs)); + WARN_ON(!refcount_dec_and_test(&eb->refs)); /* * Any attached folios need to be detached before we unlock them. This @@ -3437,8 +3466,7 @@ static int release_extent_buffer(struct extent_buffer *eb) { lockdep_assert_held(&eb->refs_lock); - WARN_ON(atomic_read(&eb->refs) == 0); - if (atomic_dec_and_test(&eb->refs)) { + if (refcount_dec_and_test(&eb->refs)) { struct btrfs_fs_info *fs_info = eb->fs_info; spin_unlock(&eb->refs_lock); @@ -3458,7 +3486,7 @@ static int release_extent_buffer(struct extent_buffer *eb) * in this case. */ xa_cmpxchg_irq(&fs_info->buffer_tree, - eb->start >> fs_info->sectorsize_bits, eb, NULL, + eb->start >> fs_info->nodesize_bits, eb, NULL, GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); @@ -3484,22 +3512,26 @@ void free_extent_buffer(struct extent_buffer *eb) if (!eb) return; - refs = atomic_read(&eb->refs); + refs = refcount_read(&eb->refs); while (1) { - if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) - || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && - refs == 1)) + if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) { + if (refs == 1) + break; + } else if (refs <= 3) { break; - if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1)) + } + + /* Optimization to avoid locking eb->refs_lock. */ + if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1)) return; } spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) == 2 && + if (refcount_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); + refcount_dec(&eb->refs); /* * I know this is terrible, but it's temporary until we stop tracking @@ -3516,9 +3548,9 @@ void free_extent_buffer_stale(struct extent_buffer *eb) spin_lock(&eb->refs_lock); set_bit(EXTENT_BUFFER_STALE, &eb->bflags); - if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && + if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); + refcount_dec(&eb->refs); release_extent_buffer(eb); } @@ -3576,7 +3608,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } - WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(refcount_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) @@ -3587,7 +3619,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - WARN_ON(atomic_read(&eb->refs) == 0); + WARN_ON(refcount_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); @@ -3646,9 +3678,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) static void clear_extent_buffer_reading(struct extent_buffer *eb) { - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags); } static void end_bbio_meta_read(struct btrfs_bio *bbio) @@ -3713,7 +3743,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, eb->read_mirror = 0; check_buffer_tree_ref(eb); - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, @@ -3725,7 +3755,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + u32 range_len = min_t(u64, folio_end(folio), eb->start + eb->len) - range_start; bio_add_folio_nofail(&bbio->bio, folio, range_len, @@ -4104,8 +4134,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * @start: offset of the bitmap item in the extent buffer * @nr: bit number to test */ -int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, - unsigned long nr) +bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long nr) { unsigned long i; size_t offset; @@ -4296,9 +4326,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct extent_buffer *eb; - unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits); unsigned long index = start; - unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; + unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1; int ret; xa_lock_irq(&fs_info->buffer_tree); @@ -4308,11 +4338,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * won't disappear out from under us. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); continue; } - xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4329,6 +4358,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ + xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); xa_lock_irq(&fs_info->buffer_tree); } @@ -4374,7 +4404,7 @@ int try_release_extent_buffer(struct folio *folio) * this page. */ spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); spin_unlock(&folio->mapping->i_private_lock); return 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e36e8d6a00bc..61130786b9a3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -98,7 +98,7 @@ struct extent_buffer { void *addr; spinlock_t refs_lock; - atomic_t refs; + refcount_t refs; int read_mirror; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; @@ -345,8 +345,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long len); void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, unsigned long len); -int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, - unsigned long pos); +bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, + unsigned long pos); void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); void extent_buffer_bitmap_clear(const struct extent_buffer *eb, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 02bfdb976e40..57f52585a6dd 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em) rb_erase(&em->rb_node, &inode->extent_tree.root); RB_CLEAR_NODE(&em->rb_node); - if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root))) percpu_counter_dec(&fs_info->evictable_extent_maps); } @@ -502,7 +502,7 @@ static int add_extent_mapping(struct btrfs_inode *inode, setup_extent_mapping(inode, em, modified); - if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root))) percpu_counter_inc(&fs_info->evictable_extent_maps); return 0; @@ -1337,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work) if (!root) continue; - if (is_fstree(btrfs_root_id(root))) + if (btrfs_is_fstree(btrfs_root_id(root))) nr_dropped += btrfs_scan_root(root, &ctx); btrfs_put_root(root); diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 43bf0979fd53..7935586a9dbd 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p * the cost of allocating a new one. */ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags)); - atomic_inc(&clone->refs); + refcount_inc(&clone->refs); ret = btrfs_next_leaf(inode->root, path); if (ret != 0) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54d523d4f421..c09fbc257634 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -427,7 +427,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) memset(csum_dst, 0, csum_size); count = 1; - if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(inode->root)) { u64 file_offset = bbio->file_offset + bio_offset; btrfs_set_extent_bit(&inode->io_tree, file_offset, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8ce6f45f45e0..204674934795 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -89,8 +89,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && - folio_pos(folio) + folio_size(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -801,7 +800,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); - u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; @@ -857,7 +856,7 @@ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ loff_t pos, size_t write_bytes, bool nowait) { - unsigned long index = pos >> PAGE_SHIFT; + const pgoff_t index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | fgf_set_order(write_bytes); @@ -963,6 +962,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * @pos: File offset. * @write_bytes: The length to write, will be updated to the nocow writeable * range. + * @nowait: Indicate if we can block or not (non-blocking IO context). * * This function will flush ordered extents in the range to ensure proper * nocow checks. @@ -971,7 +971,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. * -EAGAIN If we can't do a nocow write because snapshoting of the inode's - * root is in progress. + * root is in progress or because we are in a non-blocking IO + * context and need to block (@nowait is true). * < 0 If an error happened. * * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. @@ -983,8 +984,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; u64 lockstart, lockend; - u64 num_bytes; - int ret; + u64 cur_offset; + int ret = 0; if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; @@ -995,7 +996,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - num_bytes = lockend - lockstart + 1; if (nowait) { if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, @@ -1007,14 +1007,36 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); - if (ret <= 0) - btrfs_drew_write_unlock(&root->snapshot_lock); - else - *write_bytes = min_t(size_t, *write_bytes , - num_bytes - pos + lockstart); + + cur_offset = lockstart; + while (cur_offset < lockend) { + u64 num_bytes = lockend - cur_offset + 1; + + ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait); + if (ret <= 0) { + /* + * If cur_offset == lockstart it means we haven't found + * any extent against which we can NOCOW, so unlock the + * snapshot lock. + */ + if (cur_offset == lockstart) + btrfs_drew_write_unlock(&root->snapshot_lock); + break; + } + cur_offset += num_bytes; + } + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + /* + * cur_offset > lockstart means there's at least a partial range we can + * NOCOW, and that range can cover one or more extents. + */ + if (cur_offset > lockstart) { + *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos); + return 1; + } + return ret; } @@ -1233,8 +1255,8 @@ again: * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. */ - if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { - const u64 last_block = folio_pos(folio) + folio_size(folio); + if (reserved_start + reserved_len > folio_end(folio)) { + const u64 last_block = folio_end(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, @@ -1832,9 +1854,9 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; @@ -1842,6 +1864,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) loff_t size; size_t fsize = folio_size(folio); int ret; + bool only_release_metadata = false; u64 reserved_space; u64 page_start; u64 page_end; @@ -1849,7 +1872,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) reserved_space = fsize; - sb_start_pagefault(inode->i_sb); + sb_start_pagefault(inode->vfs_inode.i_sb); page_start = folio_pos(folio); page_end = page_start + folio_size(folio) - 1; end = page_end; @@ -1862,20 +1885,43 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (ret < 0) + ret = btrfs_check_data_free_space(inode, &data_reserved, page_start, + reserved_space, false); + if (ret < 0) { + size_t write_bytes = reserved_space; + + if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0) + goto out_noreserve; + + only_release_metadata = true; + + /* + * Can't write the whole range, there may be shared extents or + * holes in the range, bail out with @only_release_metadata set + * to true so that we unlock the nocow lock before returning the + * error. + */ + if (write_bytes < reserved_space) + goto out_noreserve; + } + ret = btrfs_delalloc_reserve_metadata(inode, reserved_space, + reserved_space, false); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + page_start, reserved_space); goto out_noreserve; + } ret = file_update_time(vmf->vma->vm_file); if (ret < 0) goto out; again: - down_read(&BTRFS_I(inode)->i_mmap_lock); + down_read(&inode->i_mmap_lock); folio_lock(folio); - size = i_size_read(inode); + size = i_size_read(&inode->vfs_inode); - if ((folio->mapping != inode->i_mapping) || + if ((folio->mapping != inode->vfs_inode.i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; @@ -1893,11 +1939,11 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); + ordered = btrfs_lookup_ordered_range(inode, page_start, fsize); if (ordered) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -1906,10 +1952,14 @@ again: if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < fsize) { + const u64 to_free = fsize - reserved_space; + end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, end + 1, - fsize - reserved_space, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, to_free, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + end + 1, to_free, true); } } @@ -1920,12 +1970,11 @@ again: * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ - btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + btrfs_clear_extent_bit(io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state); if (ret < 0) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; @@ -1944,26 +1993,38 @@ again: btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + btrfs_set_inode_last_sub_trans(inode); + + if (only_release_metadata) + btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE, + &cached_state); btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); - sb_end_pagefault(inode->i_sb); + btrfs_delalloc_release_extents(inode, fsize); + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + sb_end_pagefault(inode->vfs_inode.i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - up_read(&BTRFS_I(inode)->i_mmap_lock); + up_read(&inode->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, true); + btrfs_delalloc_release_extents(inode, fsize); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, reserved_space, true); + else + btrfs_delalloc_release_space(inode, data_reserved, page_start, + reserved_space, true); extent_changeset_free(data_reserved); out_noreserve: - sb_end_pagefault(inode->i_sb); + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); + + sb_end_pagefault(inode->vfs_inode.i_sb); if (ret < 0) return vmf_error(ret); @@ -1978,15 +2039,16 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .page_mkwrite = btrfs_page_mkwrite, }; -static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); - vma->vm_ops = &btrfs_file_vm_ops; + desc->vm_ops = &btrfs_file_vm_ops; return 0; } @@ -2195,7 +2257,7 @@ static bool check_range_has_page(struct inode *inode, u64 start, u64 end) if (folio->index < start_index) continue; /* A large folio extends beyond the end. Not a target. */ - if (folio->index + folio_nr_pages(folio) > end_index) + if (folio_next_index(folio) > end_index) continue; /* A folio doesn't cover the head/tail index. Found a target. */ ret = true; @@ -2341,7 +2403,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; unsigned int rsv_count; u64 cur_offset; u64 len = end - start; @@ -2350,13 +2412,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, if (end <= start) return -EINVAL; - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) { - ret = -ENOMEM; - goto out; - } - rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1); + rsv.failfast = true; /* * 1 - update the inode @@ -2373,14 +2431,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_free; + goto out_release; } - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); if (WARN_ON(ret)) goto out_trans; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; cur_offset = start; drop_args.path = path; @@ -2496,10 +2554,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; cur_offset = drop_args.drop_end; len = end - cur_offset; @@ -2576,16 +2634,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, out_trans: if (!trans) - goto out_free; + goto out_release; trans->block_rsv = &fs_info->trans_block_rsv; if (ret) btrfs_end_transaction(trans); else *trans_out = trans; -out_free: - btrfs_free_block_rsv(fs_info, rsv); -out: +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); return ret; } @@ -3765,7 +3822,7 @@ const struct file_operations btrfs_file_operations = { .splice_read = filemap_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, - .mmap = btrfs_file_mmap, + .mmap_prepare = btrfs_file_mmap_prepare, .open = btrfs_file_open, .release = btrfs_release_file, .get_unmapped_area = thp_get_unmapped_area, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4b34ea1f01c2..5d8d1570a5c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -366,7 +366,7 @@ fail: static void readahead_cache(struct inode *inode) { struct file_ra_state ra; - unsigned long last_index; + pgoff_t last_index; file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; @@ -3192,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - int err; + int ret2; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; @@ -3200,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); - if (err) { + ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true); + if (ret2) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); return 0; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0c573d46639a..eba7f22ae49c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root( return btrfs_global_root(block_group->fs_info, &key); } -void set_free_space_tree_thresholds(struct btrfs_block_group *cache) +void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; size_t bitmap_size; @@ -82,22 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; info = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - - ret = 0; -out: btrfs_release_path(path); - return ret; + return 0; } EXPORT_FOR_TESTS -struct btrfs_free_space_info *search_free_space_info( +struct btrfs_free_space_info *btrfs_search_free_space_info( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, int cow) @@ -201,9 +198,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } EXPORT_FOR_TESTS -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path) +int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); @@ -281,7 +278,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, block_group, path, 1); + info = btrfs_search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); btrfs_abort_transaction(trans, ret); @@ -290,6 +287,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + block_group->using_free_space_bitmaps = true; + block_group->using_free_space_bitmaps_cached = true; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); @@ -343,9 +342,9 @@ out: } EXPORT_FOR_TESTS -int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path) +int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_free_space_root(block_group); @@ -409,12 +408,12 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, data_size = free_space_bitmap_size(fs_info, found_key.offset); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + path->slots[0]--; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); read_extent_buffer(leaf, bitmap_cursor, ptr, data_size); nr++; - path->slots[0]--; } else { ASSERT(0); } @@ -428,7 +427,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, block_group, path, 1); + info = btrfs_search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); btrfs_abort_transaction(trans, ret); @@ -437,20 +436,22 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; flags = btrfs_free_space_flags(leaf, info); flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + block_group->using_free_space_bitmaps = false; + block_group->using_free_space_bitmaps_cached = true; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); btrfs_release_path(path); - nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; + nrbits = block_group->length >> fs_info->sectorsize_bits; start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit); ASSERT(start_bit < end_bit); - key.objectid = start + start_bit * block_group->fs_info->sectorsize; + key.objectid = start + start_bit * fs_info->sectorsize; key.type = BTRFS_FREE_SPACE_EXTENT_KEY; - key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; + key.offset = (end_bit - start_bit) * fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) { @@ -493,11 +494,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, block_group, path, 1); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + info = btrfs_search_free_space_info(trans, block_group, path, 1); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); extent_count = btrfs_free_space_extent_count(path->nodes[0], info); @@ -507,19 +507,18 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count > block_group->bitmap_high_thresh) { - ret = convert_free_space_to_bitmaps(trans, block_group, path); + ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path); } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && extent_count < block_group->bitmap_low_thresh) { - ret = convert_free_space_to_extents(trans, block_group, path); + ret = btrfs_convert_free_space_to_extents(trans, block_group, path); } -out: return ret; } EXPORT_FOR_TESTS -int free_space_test_bit(struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 offset) +bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset) { struct extent_buffer *leaf; struct btrfs_key key; @@ -537,13 +536,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); i = div_u64(offset - found_start, block_group->fs_info->sectorsize); - return !!extent_buffer_test_bit(leaf, ptr, i); + return extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 *start, u64 *size, - int bit) +static void free_space_modify_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + bool set_bits) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_buffer *leaf; @@ -567,7 +566,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); first = (*start - found_start) >> fs_info->sectorsize_bits; last = (end - found_start) >> fs_info->sectorsize_bits; - if (bit) + if (set_bits) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); @@ -611,13 +610,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path, - u64 start, u64 size, int remove) + u64 start, u64 size, bool remove) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_key key; u64 end = start + size; u64 cur_start, cur_size; - int prev_bit, next_bit; + bool prev_bit_set = false; + bool next_bit_set = false; int new_extents; int ret; @@ -634,16 +634,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) - goto out; + return ret; - prev_bit = free_space_test_bit(block_group, path, prev_block); + prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block); /* The previous block may have been in the previous bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (start >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } } else { key.objectid = start; @@ -652,9 +652,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); if (ret) - goto out; - - prev_bit = -1; + return ret; } /* @@ -664,13 +662,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, cur_start = start; cur_size = size; while (1) { - free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, - !remove); + free_space_modify_bits(trans, block_group, path, &cur_start, + &cur_size, !remove); if (cur_size == 0) break; ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } /* @@ -683,42 +681,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, if (end >= key.objectid + key.offset) { ret = free_space_next_bitmap(trans, root, path); if (ret) - goto out; + return ret; } - next_bit = free_space_test_bit(block_group, path, end); - } else { - next_bit = -1; + next_bit_set = btrfs_free_space_test_bit(block_group, path, end); } if (remove) { new_extents = -1; - if (prev_bit == 1) { + if (prev_bit_set) { /* Leftover on the left. */ new_extents++; } - if (next_bit == 1) { + if (next_bit_set) { /* Leftover on the right. */ new_extents++; } } else { new_extents = 1; - if (prev_bit == 1) { + if (prev_bit_set) { /* Merging with neighbor on the left. */ new_extents--; } - if (next_bit == 1) { + if (next_bit_set) { /* Merging with neighbor on the right. */ new_extents--; } } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } static int remove_free_space_extent(struct btrfs_trans_handle *trans, @@ -739,7 +731,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -771,7 +763,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, /* Delete the existing key (cases 1-4). */ ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; /* Add a key for leftovers at the beginning (cases 3 and 4). */ if (start > found_start) { @@ -782,7 +774,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) - goto out; + return ret; new_extents++; } @@ -795,50 +787,58 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, &key, 0); if (ret) - goto out; + return ret; new_extents++; } btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } -EXPORT_FOR_TESTS -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size) +static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path) { struct btrfs_free_space_info *info; u32 flags; - int ret; - if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { - ret = __add_block_group_free_space(trans, block_group, path); - if (ret) - return ret; - } + if (bg->using_free_space_bitmaps_cached) + return bg->using_free_space_bitmaps; - info = search_free_space_info(NULL, block_group, path, 0); + info = btrfs_search_free_space_info(NULL, bg, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); btrfs_release_path(path); - if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS); + bg->using_free_space_bitmaps_cached = true; + + return bg->using_free_space_bitmaps; +} + +EXPORT_FOR_TESTS +int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + int ret; + + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; + + ret = using_bitmaps(block_group, path); + if (ret < 0) + return ret; + + if (ret) return modify_free_space_bitmap(trans, block_group, path, - start, size, 1); - } else { - return remove_free_space_extent(trans, block_group, path, - start, size); - } + start, size, true); + + return remove_free_space_extent(trans, block_group, path, start, size); } -int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size) +int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; @@ -863,8 +863,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __remove_from_free_space_tree(trans, block_group, path, start, - size); + ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); if (ret) btrfs_abort_transaction(trans, ret); @@ -918,7 +917,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -941,7 +940,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, if (found_end == start) { ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; new_key.objectid = found_start; new_key.offset += key.offset; new_extents--; @@ -958,7 +957,7 @@ right: ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (ret) - goto out; + return ret; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -982,7 +981,7 @@ right: if (found_start == end) { ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + return ret; new_key.offset += key.offset; new_extents--; } @@ -992,48 +991,36 @@ insert: /* Insert the new key (cases 1-4). */ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); if (ret) - goto out; + return ret; btrfs_release_path(path); - ret = update_free_space_extent_count(trans, block_group, path, - new_extents); - -out: - return ret; + return update_free_space_extent_count(trans, block_group, path, new_extents); } EXPORT_FOR_TESTS -int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size) +int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_free_space_info *info; - u32 flags; int ret; - if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) { - ret = __add_block_group_free_space(trans, block_group, path); - if (ret) - return ret; - } + ret = __add_block_group_free_space(trans, block_group, path); + if (ret) + return ret; - info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) - return PTR_ERR(info); - flags = btrfs_free_space_flags(path->nodes[0], info); - btrfs_release_path(path); + ret = using_bitmaps(block_group, path); + if (ret < 0) + return ret; - if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (ret) return modify_free_space_bitmap(trans, block_group, path, - start, size, 0); - } else { - return add_free_space_extent(trans, block_group, path, start, - size); - } + start, size, false); + + return add_free_space_extent(trans, block_group, path, start, size); } -int add_to_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size) +int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size) { struct btrfs_block_group *block_group; struct btrfs_path *path; @@ -1058,7 +1045,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, } mutex_lock(&block_group->free_space_lock); - ret = __add_to_free_space_tree(trans, block_group, path, start, size); + ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); if (ret) btrfs_abort_transaction(trans, ret); @@ -1115,11 +1102,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1128,11 +1125,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, break; if (start < key.objectid) { - ret = __add_to_free_space_tree(trans, - block_group, - path2, start, - key.objectid - - start); + ret = __btrfs_add_to_free_space_tree(trans, + block_group, + path2, start, + key.objectid - + start); if (ret) goto out_locked; } @@ -1149,12 +1146,10 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { - ret = __add_to_free_space_tree(trans, block_group, path2, - start, end - start); + ret = __btrfs_add_to_free_space_tree(trans, block_group, path2, + start, end - start); if (ret) goto out_locked; } @@ -1233,6 +1228,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; + struct rb_node *node; int nr; int ret; @@ -1261,6 +1257,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, btrfs_release_path(path); } + node = rb_first_cached(&trans->fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *bg; + + bg = rb_entry(node, struct btrfs_block_group, cache_node); + clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags); + node = rb_next(node); + cond_resched(); + } + return 0; } @@ -1350,12 +1356,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) block_group = rb_entry(node, struct btrfs_block_group, cache_node); + + if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, + &block_group->runtime_flags)) + goto next; + ret = populate_free_space_tree(trans, block_group); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } +next: if (btrfs_should_end_transaction(trans)) { btrfs_end_transaction(trans); trans = btrfs_start_transaction(free_space_root, 1); @@ -1378,51 +1390,79 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { + bool own_path = false; int ret; - clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); + if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, + &block_group->runtime_flags)) + return 0; + + /* + * While rebuilding the free space tree we may allocate new metadata + * block groups while modifying the free space tree. + * + * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we + * can use multiple transactions, every time btrfs_end_transaction() is + * called at btrfs_rebuild_free_space_tree() we finish the creation of + * new block groups by calling btrfs_create_pending_block_groups(), and + * that in turn calls us, through add_block_group_free_space(), to add + * a free space info item and a free space extent item for the block + * group. + * + * Then later btrfs_rebuild_free_space_tree() may find such new block + * groups and processes them with populate_free_space_tree(), which can + * fail with EEXIST since there are already items for the block group in + * the free space tree. Notice that we say "may find" because a new + * block group may be added to the block groups rbtree in a node before + * or after the block group currently being processed by the rebuild + * process. So signal the rebuild process to skip such new block groups + * if it finds them. + */ + set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + btrfs_abort_transaction(trans, -ENOMEM); + return -ENOMEM; + } + own_path = true; + } ret = add_new_free_space_info(trans, block_group, path); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + ret = __btrfs_add_to_free_space_tree(trans, block_group, path, + block_group->start, block_group->length); if (ret) - return ret; + btrfs_abort_transaction(trans, ret); - return __add_to_free_space_tree(trans, block_group, path, - block_group->start, - block_group->length); +out: + if (own_path) + btrfs_free_path(path); + + return ret; } -int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group) +int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path = NULL; - int ret = 0; + int ret; - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) return 0; mutex_lock(&block_group->free_space_lock); - if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) - goto out; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - ret = __add_block_group_free_space(trans, block_group, path); - -out: - btrfs_free_path(path); + ret = __add_block_group_free_space(trans, block_group, NULL); mutex_unlock(&block_group->free_space_lock); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } -int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group) +int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); struct btrfs_path *path; @@ -1443,6 +1483,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -1455,8 +1496,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -1484,16 +1527,16 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } ret = 0; out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1505,7 +1548,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; - int prev_bit = 0, bit; + bool prev_bit_set = false; /* Initialize to silence GCC. */ u64 extent_start = 0; u64 end, offset; @@ -1522,7 +1565,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, while (1) { ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret) break; @@ -1536,10 +1579,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, offset = key.objectid; while (offset < key.objectid + key.offset) { - bit = free_space_test_bit(block_group, path, offset); - if (prev_bit == 0 && bit == 1) { + bool bit_set; + + bit_set = btrfs_free_space_test_bit(block_group, path, offset); + if (!prev_bit_set && bit_set) { extent_start = offset; - } else if (prev_bit == 1 && bit == 0) { + } else if (prev_bit_set && !bit_set) { u64 space_added; ret = btrfs_add_new_free_space(block_group, @@ -1547,7 +1592,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, offset, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; @@ -1555,14 +1600,14 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } extent_count++; } - prev_bit = bit; + prev_bit_set = bit_set; offset += fs_info->sectorsize; } } - if (prev_bit == 1) { + if (prev_bit_set) { ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) - goto out; + return ret; extent_count++; } @@ -1572,13 +1617,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, block_group->start, extent_count, expected_extent_count); DEBUG_WARN(); - ret = -EIO; - goto out; + return -EIO; } - ret = 0; -out: - return ret; + return 0; } static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, @@ -1605,7 +1647,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret) break; @@ -1621,7 +1663,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, key.objectid + key.offset, &space_added); if (ret) - goto out; + return ret; total_found += space_added; if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; @@ -1636,16 +1678,13 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, block_group->start, extent_count, expected_extent_count); DEBUG_WARN(); - ret = -EIO; - goto out; + return -EIO; } - ret = 0; -out: - return ret; + return 0; } -int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; @@ -1666,7 +1705,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->search_commit_root = 1; path->reada = READA_FORWARD; - info = search_free_space_info(NULL, block_group, path, 0); + info = btrfs_search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index e6c6d6f4f221..3d9a5d4477fc 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -22,39 +22,39 @@ struct btrfs_trans_handle; #define BTRFS_FREE_SPACE_BITMAP_SIZE 256 #define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) -void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); +void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); -int load_free_space_tree(struct btrfs_caching_control *caching_ctl); -int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group); -int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group); -int add_to_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size); -int remove_from_free_space_tree(struct btrfs_trans_handle *trans, - u64 start, u64 size); +int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group); +int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); +int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + u64 start, u64 size); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, int cow); -int __add_to_free_space_tree(struct btrfs_trans_handle *trans, +btrfs_search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size); -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 start, u64 size); -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path); -int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *block_group, - struct btrfs_path *path); -int free_space_test_bit(struct btrfs_block_group *block_group, - struct btrfs_path *path, u64 offset); + struct btrfs_path *path, int cow); +int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 start, u64 size); +int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, + struct btrfs_path *path); +bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group, + struct btrfs_path *path, u64 offset); #endif #endif diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 4394de12a767..8cc07cc70b12 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -420,6 +420,8 @@ struct btrfs_commit_stats { u64 last_commit_dur; /* The total commit duration in ns */ u64 total_commit_dur; + /* Start of the last critical section in ns. */ + u64 critical_section_start_time; }; struct btrfs_fs_info { @@ -713,8 +715,6 @@ struct btrfs_fs_info { u32 data_chunk_allocations; u32 metadata_ratio; - void *bdev_holder; - /* Private scrub information */ struct mutex scrub_lock; atomic_t scrubs_running; @@ -739,12 +739,6 @@ struct btrfs_fs_info { spinlock_t qgroup_lock; /* - * Used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* * Protect user change for quota operations. If a transaction is needed, * it must be started before locking this lock. */ @@ -779,7 +773,7 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Entries are eb->start / sectorsize */ + /* Entries are eb->start >> nodesize_bits */ struct xarray buffer_tree; /* Next backup root to be overwritten */ @@ -811,6 +805,7 @@ struct btrfs_fs_info { /* Cached block sizes */ u32 nodesize; + u32 nodesize_bits; u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index a61c3540d67b..f06cf701ae5a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( } /* Returns NULL if no extref found */ -struct btrfs_inode_extref * -btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct fscrypt_str *name, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow) +struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid) { int ret; struct btrfs_key key; @@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ERR_PTR(ret); if (ret > 0) @@ -720,13 +717,12 @@ delete: } out: if (ret >= 0 && pending_del_nr) { - int err; + int ret2; - err = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, err); - ret = err; + ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + if (ret2) { + btrfs_abort_transaction(trans, ret2); + ret = ret2; } } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c11b97fdccc4..6d9f5ad20646 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); -struct btrfs_inode_extref *btrfs_lookup_inode_extref( - struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct fscrypt_str *name, - u64 inode_objectid, u64 ref_objectid, int ins_len, - int cow); +struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root, + struct btrfs_path *path, + const struct fscrypt_str *name, + u64 inode_objectid, u64 ref_objectid); struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c778243bf1..b77dd22b8cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -308,7 +308,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); @@ -395,8 +395,8 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, u64 offset, u64 bytes) { - unsigned long index = offset >> PAGE_SHIFT; - unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + pgoff_t index = offset >> PAGE_SHIFT; + const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; struct folio *folio; while (index <= end_index) { @@ -423,18 +423,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { - int err; + int ret; if (args->default_acl) { - err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ret = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); - if (err) - return err; + if (ret) + return ret; } if (args->acl) { - err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); - if (err) - return err; + ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (ret) + return ret; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); @@ -781,12 +781,15 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, return 0; } + /* Defrag ioctl takes precedence over mount options and properties. */ + if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) + return 0; + if (BTRFS_COMPRESS_NONE < inode->defrag_compress && + inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) + return 1; /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; - /* defrag ioctl */ - if (inode->defrag_compress) - return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; @@ -808,12 +811,11 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { - unsigned long end_index = end >> PAGE_SHIFT; + const pgoff_t end_index = end >> PAGE_SHIFT; struct folio *folio; int ret = 0; - for (unsigned long index = start >> PAGE_SHIFT; - index <= end_index; index++) { + for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) { folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) @@ -943,7 +945,7 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) { + if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; compress_level = inode->defrag_compress_level; } else if (inode->prop_compress) { @@ -1755,7 +1757,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, spin_unlock(&sinfo->lock); if (count > 0) - btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE); + btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + &cached_state); } btrfs_unlock_extent(io_tree, start, end, &cached_state); @@ -2328,8 +2331,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= folio_pos(locked_folio) || - start >= folio_pos(locked_folio) + folio_size(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); @@ -2737,7 +2739,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_pos(folio) + folio_size(folio) - 1; + u64 page_end = folio_end(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -2881,7 +2883,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) DEBUG_WARN(); btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs", - BTRFS_I(inode)->root->root_key.objectid, + btrfs_root_id(BTRFS_I(inode)->root), btrfs_ino(BTRFS_I(inode)), folio_pos(folio)); return -EUCLEAN; @@ -3375,8 +3377,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, NULL)) { /* Skip the range without csum for data reloc inode */ - btrfs_clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM, NULL); return true; } @@ -3946,6 +3948,7 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &inode->flags, &inode->ro_flags); btrfs_update_inode_mapping_flags(inode); + btrfs_set_inode_mapping_order(inode); cache_index: /* @@ -4078,45 +4081,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); - - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); - - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } /* @@ -4215,20 +4208,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto err; + btrfs_free_path(path); + return di ? PTR_ERR(di) : -ENOENT; } ret = btrfs_delete_one_dir_name(trans, root, path, di); + /* + * Down the call chains below we'll also need to allocate a path, so no + * need to hold on to this one for longer than necessary. + */ + btrfs_free_path(path); if (ret) - goto err; - btrfs_release_path(path); + return ret; /* * If we don't have dir index, we have to get it by looking up @@ -4250,11 +4245,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); - goto err; + return ret; } skip_backref: if (rename_ctx) @@ -4263,7 +4258,7 @@ skip_backref: ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); - goto err; + return ret; } /* @@ -4287,19 +4282,14 @@ skip_backref: * holding. */ btrfs_run_delayed_iput(fs_info, inode); -err: - btrfs_free_path(path); - if (ret) - goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); - ret = btrfs_update_inode(trans, dir); -out: - return ret; + + return btrfs_update_inode(trans, dir); } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -4704,68 +4694,68 @@ out_up_write: return ret; } -static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_inode *dir = BTRFS_I(vfs_dir); + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) + if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(BTRFS_I(dir), dentry); + return btrfs_delete_subvolume(dir, dentry); } - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(BTRFS_I(dir)); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } - if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (inode->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, dir); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + ret = btrfs_unlink_subvol(trans, dir, dentry); goto out; } - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &fname.disk_name); - if (!ret) { - btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } + ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name); + if (!ret) + btrfs_i_size_write(inode, 0); out: btrfs_end_transaction(trans); out_notrans: @@ -4821,9 +4811,9 @@ again: */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = folio_pos(folio) + folio_size(folio) - 1; + zero_end = folio_end(folio); folio_zero_range(folio, zero_start - folio_pos(folio), - zero_end - zero_start + 1); + zero_end - zero_start); out_unlock: folio_unlock(folio); @@ -4861,7 +4851,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); - size_t write_bytes = blocksize; int ret = 0; const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), blocksize); @@ -4913,8 +4902,12 @@ int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 e ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize, false); if (ret < 0) { + size_t write_bytes = blocksize; + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { - /* For nocow case, no need to reserve data space */ + /* For nocow case, no need to reserve data space. */ + ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u", + write_bytes, blocksize); only_release_metadata = true; } else { goto out; @@ -5001,8 +4994,7 @@ again: * not reach disk, it still affects our page caches. */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, - end); + zero_end = min_t(u64, folio_end(folio) - 1, end); } else { zero_start = max_t(u64, block_start, start); zero_end = min_t(u64, block_end, end); @@ -5014,11 +5006,12 @@ again: block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + EXTENT_NORESERVE, &cached_state); + + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); out_unlock: if (ret) { @@ -5256,7 +5249,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { - int err; + int ret2; /* * Truncate failed, so fix up the in-memory size. We @@ -5264,9 +5257,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. */ - err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); - if (err) - return err; + ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); + if (ret2) + return ret2; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } @@ -5279,31 +5272,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - int err; + int ret; if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; + ret = setattr_prepare(idmap, dentry, attr); + if (ret) + return ret; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr); - if (err) - return err; + ret = btrfs_setsize(inode, attr); + if (ret) + return ret; } if (attr->ia_valid) { setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(BTRFS_I(inode)); + ret = btrfs_dirty_inode(BTRFS_I(inode)); - if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(idmap, dentry, inode->i_mode); + if (!ret && attr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(idmap, dentry, inode->i_mode); } - return err; + return ret; } /* @@ -5437,7 +5430,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv = NULL; + struct btrfs_block_rsv rsv; int ret; trace_btrfs_inode_evict(inode); @@ -5485,11 +5478,9 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_kill_delayed_inode_items(BTRFS_I(inode)); - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - goto out; - rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_metadata_size(fs_info, 1); + rsv.failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5501,11 +5492,11 @@ void btrfs_evict_inode(struct inode *inode) .min_type = 0, }; - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (IS_ERR(trans)) - goto out; + goto out_release; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; @@ -5517,7 +5508,7 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) - goto out; + goto out_release; else if (!ret) break; } @@ -5531,16 +5522,17 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (!IS_ERR(trans)) { - trans->block_rsv = rsv; + trans->block_rsv = &rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); out: - btrfs_free_block_rsv(fs_info, rsv); /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want @@ -6173,8 +6165,7 @@ again: if (ret) goto nopos; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); - if (ret) + if (btrfs_readdir_delayed_dir_index(ctx, &ins_list)) goto nopos; /* @@ -6467,6 +6458,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; btrfs_update_inode_mapping_flags(BTRFS_I(inode)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6610,13 +6602,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto discard; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } return 0; @@ -6703,20 +6699,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; - int err; - err = btrfs_del_root_ref(trans, key.objectid, - btrfs_root_id(root), parent_ino, - &local_index, name); - if (err) - btrfs_abort_transaction(trans, err); + int ret2; + + ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root), + parent_ino, &local_index, name); + if (ret2) + btrfs_abort_transaction(trans, ret2); } else if (add_backref) { - u64 local_index; - int err; + int ret2; - err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, - &local_index); - if (err) - btrfs_abort_transaction(trans, err); + ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL); + if (ret2) + btrfs_abort_transaction(trans, ret2); } /* Return the original error code */ @@ -6735,20 +6729,20 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, }; unsigned int trans_num_items; struct btrfs_trans_handle *trans; - int err; + int ret; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (!err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (!ret) d_instantiate_new(dentry, inode); btrfs_end_transaction(trans); @@ -6756,9 +6750,9 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -6799,7 +6793,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; - int err; + int ret; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ @@ -6809,12 +6803,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); - if (err) + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) goto fail; - err = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (err) + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); + if (ret) goto fail; /* @@ -6825,7 +6819,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; goto fail; } @@ -6838,24 +6832,24 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); - if (err) { + if (ret) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (ret) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ - err = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (ret) goto fail; } d_instantiate(dentry, inode); @@ -6871,7 +6865,7 @@ fail: iput(inode); } btrfs_btree_balance_dirty(fs_info); - return err; + return ret; } static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, @@ -7364,13 +7358,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, static void wait_subpage_spinlock(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7383,8 +7377,8 @@ static void wait_subpage_spinlock(struct folio *folio) * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ - spin_lock_irq(&subpage->lock); - spin_unlock_irq(&subpage->lock); + spin_lock_irq(&bfs->lock); + spin_unlock_irq(&bfs->lock); } static int btrfs_launder_folio(struct folio *folio) @@ -7607,7 +7601,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; @@ -7649,11 +7643,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - return -ENOMEM; - rsv->size = min_size; - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = min_size; + rsv.failfast = true; /* * 1 for the truncate slack space @@ -7666,7 +7658,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) } /* Migrate the slack space for the truncate to our reserve */ - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the transaction and @@ -7678,7 +7670,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) goto out; } - trans->block_rsv = rsv; + trans->block_rsv = &rsv; while (1) { struct extent_state *cached_state = NULL; @@ -7721,9 +7713,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) break; } - btrfs_block_rsv_release(fs_info, rsv, -1, NULL); + btrfs_block_rsv_release(fs_info, &rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never @@ -7732,7 +7724,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; } /* @@ -7771,7 +7763,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_btree_balance_dirty(fs_info); } out: - btrfs_free_block_rsv(fs_info, rsv); + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -8026,7 +8018,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->subvol = btrfs_root_id(BTRFS_I(inode)->root); stat->result_mask |= STATX_SUBVOL; spin_lock(&BTRFS_I(inode)->lock); @@ -8059,6 +8051,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8182,6 +8175,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8253,30 +8271,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8326,6 +8337,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8460,6 +8472,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8524,7 +8559,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8540,6 +8575,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8776,7 +8815,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, .dentry = dentry, }; unsigned int trans_num_items; - int err; + int ret; int name_len; int datasize; unsigned long ptr; @@ -8803,26 +8842,26 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode_set_bytes(inode, name_len); new_inode_args.inode = inode; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; /* 1 additional item for the inline extent */ trans_num_items++; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (ret) goto out; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; - btrfs_abort_transaction(trans, err); + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); discard_new_inode(inode); inode = NULL; goto out; @@ -8831,10 +8870,9 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); - err = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (err) { - btrfs_abort_transaction(trans, err); + ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); + if (ret) { + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); inode = NULL; @@ -8856,16 +8894,16 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, btrfs_free_path(path); d_instantiate_new(dentry, inode); - err = 0; + ret = 0; out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static struct btrfs_trans_handle *insert_prealloc_file_extent( diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 913acef3f0a9..7e13de2bdcbf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -245,7 +245,7 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. */ -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); @@ -254,7 +254,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; @@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap, goto out; } + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); - d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; @@ -841,7 +841,7 @@ free_pending: static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { - int error; + int ret; if (d_really_is_negative(victim)) return -ENOENT; @@ -851,9 +851,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); - if (error) - return error; + ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); + if (ret) + return ret; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, d_inode(victim)) || @@ -892,39 +892,37 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap, * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ -static noinline int btrfs_mksubvol(const struct path *parent, +static noinline int btrfs_mksubvol(struct dentry *parent, struct mnt_idmap *idmap, - const char *name, int namelen, - struct btrfs_root *snap_src, + struct qstr *qname, struct btrfs_root *snap_src, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = d_inode(parent->dentry); + struct inode *dir = d_inode(parent); struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; - struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); - int error; + struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); + int ret; - error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (error == -EINTR) - return error; + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) + return ret; - dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); - error = PTR_ERR(dentry); + dentry = lookup_one(idmap, qname, parent); + ret = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(idmap, dir, dentry); - if (error) + ret = btrfs_may_create(idmap, dir, dentry); + if (ret) goto out_dput; /* * even if this name doesn't exist, we may get hash collisions. * check for them now when we can safely fail */ - error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, &name_str); - if (error) + ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str); + if (ret) goto out_dput; down_read(&fs_info->subvol_sem); @@ -933,11 +931,11 @@ static noinline int btrfs_mksubvol(const struct path *parent, goto out_up_read; if (snap_src) - error = create_snapshot(snap_src, dir, dentry, readonly, inherit); + ret = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(idmap, dir, dentry, inherit); + ret = create_subvol(idmap, dir, dentry, inherit); - if (!error) + if (!ret) fsnotify_mkdir(dir, dentry); out_up_read: up_read(&fs_info->subvol_sem); @@ -945,12 +943,12 @@ out_dput: dput(dentry); out_unlock: btrfs_inode_unlock(BTRFS_I(dir), 0); - return error; + return ret; } -static noinline int btrfs_mksnapshot(const struct path *parent, +static noinline int btrfs_mksnapshot(struct dentry *parent, struct mnt_idmap *idmap, - const char *name, int namelen, + struct qstr *qname, struct btrfs_root *root, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -977,8 +975,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, NULL); - ret = btrfs_mksubvol(parent, idmap, name, namelen, - root, readonly, inherit); + ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit); + atomic_dec(&root->snapshot_force_cow); out: btrfs_drew_read_unlock(&root->snapshot_lock); @@ -1169,7 +1167,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } /* equal, nothing need to do */ if (ret == 0 && new_size != old_size) - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "resize device %s (devid %llu) from %llu to %llu", btrfs_dev_name(device), device->devid, old_size, new_size); @@ -1184,12 +1182,12 @@ out_drop: static noinline int __btrfs_ioctl_snap_create(struct file *file, struct mnt_idmap *idmap, - const char *name, unsigned long fd, int subvol, + const char *name, unsigned long fd, bool subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) { - int namelen; int ret = 0; + struct qstr qname = QSTR_INIT(name, strlen(name)); if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; @@ -1198,21 +1196,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, if (ret) goto out; - namelen = strlen(name); if (strchr(name, '/')) { ret = -EINVAL; goto out_drop_write; } - if (name[0] == '.' && - (namelen == 1 || (name[1] == '.' && namelen == 2))) { + if (qname.name[0] == '.' && + (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) { ret = -EEXIST; goto out_drop_write; } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, idmap, name, - namelen, NULL, readonly, inherit); + ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL, + readonly, inherit); } else { CLASS(fd, src)(fd); struct inode *src_inode; @@ -1242,8 +1239,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, */ ret = -EINVAL; } else { - ret = btrfs_mksnapshot(&file->f_path, idmap, - name, namelen, + ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname, BTRFS_I(src_inode)->root, readonly, inherit); } @@ -1280,7 +1276,7 @@ out: } static noinline int btrfs_ioctl_snap_create_v2(struct file *file, - void __user *arg, int subvol) + void __user *arg, bool subvol) { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; @@ -2558,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EOPNOTSUPP; goto out; } - /* compression requires us to start the IO */ - if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) && + (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) { + ret = -EINVAL; + goto out; + } + /* Compression or no-compression require to start the IO. */ + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) || + (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; range.extent_thresh = (u32)-1; } @@ -2700,7 +2702,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2751,7 +2753,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); out_free: @@ -2890,7 +2892,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(btrfs_root_id(new_root))) { + if (!btrfs_is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -3139,7 +3141,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + btrfs_err(fs_info, "scrub: extent tree v2 not yet supported"); return -EINVAL; } @@ -3357,7 +3359,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; - struct btrfs_path *path = NULL; bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) @@ -3391,14 +3392,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, goto out_loi; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - inodes, ignore_offset); - btrfs_free_path(path); + ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -3715,22 +3709,6 @@ drop_write: return ret; } -/* - * Quick check for ioctl handlers if quotas are enabled. Proper locking must be - * done before any operations. - */ -static bool qgroup_enabled(struct btrfs_fs_info *fs_info) -{ - bool ret = true; - - mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!fs_info->quota_root) - ret = false; - mutex_unlock(&fs_info->qgroup_ioctl_lock); - - return ret; -} - static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); @@ -3745,7 +3723,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3815,7 +3793,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3833,7 +3811,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto out; } - if (sa->create && is_fstree(sa->qgroupid)) { + if (sa->create && btrfs_is_fstree(sa->qgroupid)) { ret = -EINVAL; goto out; } @@ -3874,7 +3852,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(root->fs_info)) + if (!btrfs_qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -3922,7 +3900,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!qgroup_enabled(fs_info)) + if (!btrfs_qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); @@ -4200,7 +4178,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) } spin_lock(&fs_info->super_lock); - strcpy(super_block->label, label); + strscpy(super_block->label, label); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); @@ -4629,6 +4607,13 @@ out_acct: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + /* * Context that's attached to an encoded read io_uring command, in cmd->pdu. It * contains the fields in btrfs_uring_read_extent that are necessary to finish @@ -4650,6 +4635,7 @@ struct btrfs_uring_priv { }; struct io_btrfs_cmd { + struct btrfs_uring_encoded_data *data; struct btrfs_uring_priv *priv; }; @@ -4659,7 +4645,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; - unsigned long index; + pgoff_t index; u64 cur; size_t page_offset; ssize_t ret; @@ -4708,6 +4694,7 @@ out: kfree(priv->pages); kfree(priv->iov); kfree(priv); + kfree(bc->data); } void btrfs_uring_read_extent_endio(void *ctx, int err) @@ -4791,13 +4778,6 @@ out_fail: return ret; } -struct btrfs_uring_encoded_data { - struct btrfs_ioctl_encoded_io_args args; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - struct iov_iter iter; -}; - static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); @@ -4813,7 +4793,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; - struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4829,7 +4813,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); #else - return -ENOTTY; + ret = -ENOTTY; + goto out_acct; #endif } else { copy_end = copy_end_kernel; @@ -4842,7 +4827,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue goto out_acct; } - io_uring_cmd_get_async_data(cmd)->op_data = data; + bc->data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -4940,6 +4925,9 @@ out_acct: add_rchar(current, ret); inc_syscr(current); + if (ret != -EIOCBQUEUED && ret != -EAGAIN) + kfree(data); + return ret; } @@ -4950,7 +4938,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu struct file *file; ssize_t ret; void __user *sqe_addr; - struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4972,7 +4964,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu goto out_acct; } - io_uring_cmd_get_async_data(cmd)->op_data = data; + bc->data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -5062,6 +5054,9 @@ out_acct: if (ret > 0) add_wchar(current, ret); inc_syscw(current); + + if (ret != -EAGAIN) + kfree(data); return ret; } diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e08ea446cf48..ccf6bed9cc24 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -8,7 +8,7 @@ struct file; struct dentry; struct mnt_idmap; -struct fileattr; +struct file_kattr; struct io_uring_cmd; struct btrfs_inode; struct btrfs_fs_info; @@ -16,9 +16,9 @@ struct btrfs_ioctl_balance_args; long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 6abf81bb00c2..022ebc89af85 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -37,106 +37,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); btrfs_no_printk(fs_info, fmt, ##args) #endif -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - /* - * Wrappers that use printk_in_rcu + * Print a message with filesystem info, enclosed in RCU protection. */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ +#define btrfs_crit(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ +#define btrfs_err(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ +#define btrfs_warn(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ +#define btrfs_info(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) /* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* * Wrappers that use a ratelimited printk */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +/* When printk() is no_printk(), expand to no-op. */ +#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0) +#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ @@ -146,26 +86,15 @@ do { \ rcu_read_unlock(); \ } while (0) -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ + \ + rcu_read_lock(); \ if (__ratelimit(&_rs)) \ btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 0d599fd847c9..ff5eac84d819 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -7,6 +7,8 @@ #include <linux/bitmap.h> #include <linux/sched.h> #include <linux/wait.h> +#include <linux/mm.h> +#include <linux/pagemap.h> #include <linux/math64.h> #include <linux/rbtree.h> @@ -119,28 +121,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root, return ret; } -static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) +static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct rb_simple_node *entry; + struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node); + struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node); - while (*p) { - parent = *p; - entry = rb_entry(parent, struct rb_simple_node, rb_node); + if (new_entry->bytenr < existing_entry->bytenr) + return -1; + else if (new_entry->bytenr > existing_entry->bytenr) + return 1; - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } + return 0; +} - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; +static inline struct rb_node *rb_simple_insert(struct rb_root *root, + struct rb_simple_node *simple_node) +{ + return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp); } static inline bool bitmap_test_range_all_set(const unsigned long *addr, @@ -163,4 +160,9 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr, return (found_set == start + nbits); } +static inline u64 folio_end(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9212ce110cde..2829f20d7bb5 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (folio) { ASSERT(folio->mapping); ASSERT(folio_pos(folio) <= file_offset); - ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(file_offset + len <= folio_end(folio)); /* * Ordered flag indicates whether we still have diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fc821aa446f0..74e38da9bd39 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -190,7 +190,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { - pr_warn("BTRFS: uuid item with illegal size %lu!\n", + btrfs_warn(l->fs_info, "uuid item with illegal size %lu", (unsigned long)item_size); return; } @@ -223,7 +223,7 @@ static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", - atomic_read(&eb->refs), eb->lock_owner, current->pid); + refcount_read(&eb->refs), eb->lock_owner, current->pid); #endif } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b3176edbde82..1a5972178b3a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); +static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *qgroupid = key; + const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < *qgroupid) + return -1; + else if (qgroup->qgroupid > *qgroupid) + return 1; + + return 0; +} + /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info, u64 qgroupid) { - struct rb_node *n = fs_info->qgroup_tree.rb_node; - struct btrfs_qgroup *qgroup; + struct rb_node *node; - while (n) { - qgroup = rb_entry(n, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) - n = n->rb_left; - else if (qgroup->qgroupid > qgroupid) - n = n->rb_right; - else - return qgroup; - } - return NULL; + node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp); + return rb_entry_safe(node, struct btrfs_qgroup, node); +} + +static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node); + + return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing); } /* @@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *prealloc, u64 qgroupid) { - struct rb_node **p = &fs_info->qgroup_tree.rb_node; - struct rb_node *parent = NULL; - struct btrfs_qgroup *qgroup; + struct rb_node *node; /* Caller must have pre-allocated @prealloc. */ ASSERT(prealloc); - while (*p) { - parent = *p; - qgroup = rb_entry(parent, struct btrfs_qgroup, node); - - if (qgroup->qgroupid < qgroupid) { - p = &(*p)->rb_left; - } else if (qgroup->qgroupid > qgroupid) { - p = &(*p)->rb_right; - } else { - kfree(prealloc); - return qgroup; - } + prealloc->qgroupid = qgroupid; + node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp); + if (node) { + kfree(prealloc); + return rb_entry(node, struct btrfs_qgroup, node); } - qgroup = prealloc; - qgroup->qgroupid = qgroupid; - INIT_LIST_HEAD(&qgroup->groups); - INIT_LIST_HEAD(&qgroup->members); - INIT_LIST_HEAD(&qgroup->dirty); - INIT_LIST_HEAD(&qgroup->iterator); - INIT_LIST_HEAD(&qgroup->nested_iterator); + INIT_LIST_HEAD(&prealloc->groups); + INIT_LIST_HEAD(&prealloc->members); + INIT_LIST_HEAD(&prealloc->dirty); + INIT_LIST_HEAD(&prealloc->iterator); + INIT_LIST_HEAD(&prealloc->nested_iterator); - rb_link_node(&qgroup->node, parent, p); - rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); - - return qgroup; + return prealloc; } static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) @@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid } #endif -static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) +__printf(2, 3) +static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...) { + const u64 old_flags = fs_info->qgroup_flags; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); + if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf); + va_end(args); + } } static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, @@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) if (!fs_info->quota_root) return 0; - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { - ret = -ENOMEM; - goto out; - } - path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) qgroup_read_enable_gen(fs_info, l, slot, ptr); - } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { - qgroup_mark_inconsistent(fs_info); - btrfs_err(fs_info, - "qgroup generation mismatch, marked as inconsistent"); - } + else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) + qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch"); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || - (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - btrfs_err(fs_info, "inconsistent qgroup config"); - qgroup_mark_inconsistent(fs_info); - } + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) + qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config"); if (!qgroup) { struct btrfs_qgroup *prealloc; struct btrfs_root *tree_root = fs_info->tree_root; @@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) * during mount before we start doing things like creating * subvolumes. */ - if (is_fstree(qgroup->qgroupid) && + if (btrfs_is_fstree(qgroup->qgroupid) && qgroup->qgroupid > tree_root->free_objectid) /* * Don't need to check against BTRFS_LAST_FREE_OBJECTID, @@ -581,8 +581,6 @@ out: if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } else { - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; btrfs_sysfs_del_qgroups(fs_info); } @@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info) /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), - * first two are in single-threaded paths.And for the third one, we have set - * quota_root to be null with qgroup_lock held before, so it is safe to clean - * up the in-memory structures without qgroup_lock held. + * first two are in single-threaded paths. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; + /* + * btrfs_quota_disable() can be called concurrently with + * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the + * lock. + */ + spin_lock(&fs_info->qgroup_lock); while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(qgroup); + spin_unlock(&fs_info->qgroup_lock); btrfs_sysfs_del_one_qgroup(fs_info, qgroup); kfree(qgroup); + spin_lock(&fs_info->qgroup_lock); } - /* - * We call btrfs_free_qgroup_config() when unmounting - * filesystem and disabling quota, so we set qgroup_ulist - * to be null here to avoid double free. - */ - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; + spin_unlock(&fs_info->qgroup_lock); + btrfs_sysfs_del_qgroups(fs_info); } @@ -998,7 +997,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup = NULL; struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; - struct ulist *ulist = NULL; const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1021,12 +1019,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (fs_info->quota_root) goto out; - ulist = ulist_alloc(GFP_KERNEL); - if (!ulist) { - ret = -ENOMEM; - goto out; - } - ret = btrfs_sysfs_add_qgroups(fs_info); if (ret < 0) goto out; @@ -1066,9 +1058,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, if (fs_info->quota_root) goto out; - fs_info->qgroup_ulist = ulist; - ulist = NULL; - /* * initially create the quota tree */ @@ -1155,11 +1144,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info, qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); prealloc = NULL; - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -1272,17 +1256,13 @@ out_free_root: if (ret) btrfs_put_root(quota_root); out: - if (ret) { - ulist_free(fs_info->qgroup_ulist); - fs_info->qgroup_ulist = NULL; + if (ret) btrfs_sysfs_del_qgroups(fs_info); - } mutex_unlock(&fs_info->qgroup_ioctl_lock); if (ret && trans) btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - ulist_free(ulist); kfree(prealloc); return ret; } @@ -1354,11 +1334,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) /* * We have nothing held here and no trans handle, just return the error - * if there is one. + * if there is one and set back the quota enabled bit since we didn't + * actually disable quotas. */ ret = flush_reservations(fs_info); - if (ret) + if (ret) { + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); return ret; + } /* * 1 For the root item @@ -1679,9 +1662,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) struct btrfs_qgroup *prealloc = NULL; int ret = 0; - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) - return 0; - mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1844,13 +1824,12 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr) { DEBUG_WARN(); - btrfs_warn_rl(fs_info, -"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", - btrfs_qgroup_level(qgroup->qgroupid), - btrfs_qgroup_subvolid(qgroup->qgroupid), - qgroup->rfer, qgroup->rfer_cmpr, - qgroup->excl, qgroup->excl_cmpr); - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + qgroup->rfer, qgroup->rfer_cmpr, + qgroup->excl, qgroup->excl_cmpr); } } del_qgroup_rb(fs_info, qgroupid); @@ -1873,7 +1852,8 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su struct btrfs_trans_handle *trans; int ret; - if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root) + if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || + !fs_info->quota_root) return 0; /* @@ -1968,11 +1948,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_limit_item(trans, qgroup); - if (ret) { - qgroup_mark_inconsistent(fs_info); - btrfs_info(fs_info, "unable to update quota limit for %llu", - qgroupid); - } + if (ret) + qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); @@ -2027,7 +2004,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC); xa_unlock(&delayed_refs->dirty_extents); if (xa_is_err(ret)) { - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret)); return xa_err(ret); } @@ -2094,10 +2071,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { - qgroup_mark_inconsistent(fs_info); - btrfs_warn(fs_info, -"error accounting new delayed refs extent (err code: %d), quota inconsistent", - ret); + qgroup_mark_inconsistent(fs_info, + "error accounting new delayed refs extent: %d", ret); return 0; } @@ -2341,7 +2316,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ - atomic_inc(&src_eb->refs); + refcount_inc(&src_eb->refs); src_path->nodes[root_level] = src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; @@ -2574,7 +2549,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, goto out; } /* For dst_path */ - atomic_inc(&dst_eb->refs); + refcount_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; @@ -2589,7 +2564,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, out: btrfs_free_path(dst_path); if (ret < 0) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret); return ret; } @@ -2633,7 +2608,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * mark qgroup inconsistent. */ if (root_level >= drop_subptree_thres) { - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "subtree level reached threshold"); return 0; } @@ -2666,7 +2641,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ - atomic_inc(&root_eb->refs); /* For path */ + refcount_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ @@ -2932,7 +2907,7 @@ static int maybe_fs_roots(struct ulist *roots) * trees. * If it contains a non-fs tree, it won't be shared with fs/subvol trees. */ - return is_fstree(unode->val); + return btrfs_is_fstree(unode->val); } int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, @@ -3133,10 +3108,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->qgroup_lock); ret = update_qgroup_info_item(trans, qgroup); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup info item update error %d", ret); ret = update_qgroup_limit_item(trans, qgroup); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup limit item update error %d", ret); spin_lock(&fs_info->qgroup_lock); } if (btrfs_qgroup_enabled(fs_info)) @@ -3147,7 +3124,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) ret = update_qgroup_status_item(trans); if (ret) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "qgroup status item update error %d", ret); return ret; } @@ -3329,6 +3307,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); if (!prealloc) return -ENOMEM; @@ -3352,8 +3333,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!btrfs_qgroup_enabled(fs_info)) - goto out; quota_root = fs_info->quota_root; if (!quota_root) { @@ -3554,7 +3533,7 @@ out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan"); if (qlist_prealloc) { for (int i = 0; i < inherit->num_qgroups; i++) kfree(qlist_prealloc[i]); @@ -3588,7 +3567,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, int ret = 0; LIST_HEAD(qgroup_list); - if (!is_fstree(ref_root)) + if (!btrfs_is_fstree(ref_root)) return 0; if (num_bytes == 0) @@ -3648,7 +3627,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); - if (!is_fstree(ref_root)) + if (!btrfs_is_fstree(ref_root)) return; if (num_bytes == 0) @@ -4036,12 +4015,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = true; - btrfs_queue_work(fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + /* + * The rescan worker is only for full accounting qgroups, check if it's + * enabled as it is pointless to queue it otherwise. A concurrent quota + * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED. + */ + if (btrfs_qgroup_full_accounting(fs_info)) { + fs_info->qgroup_rescan_running = true; + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } else { + ret = -ENOTCONN; + } mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + return ret; } int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, @@ -4128,8 +4116,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ - clear_ret = btrfs_clear_extent_bits(&inode->io_tree, entry_start, - entry_end, EXTENT_QGROUP_RESERVED); + clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end, + EXTENT_QGROUP_RESERVED, NULL); if (!ret && clear_ret < 0) ret = clear_ret; @@ -4216,7 +4204,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root)) || len == 0) + !btrfs_is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -4469,7 +4457,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root)) || num_bytes == 0) + !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -4514,7 +4502,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ @@ -4530,7 +4518,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* @@ -4589,7 +4577,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(btrfs_root_id(root))) + !btrfs_is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, @@ -4673,6 +4661,28 @@ out: spin_unlock(&swapped_blocks->lock); } +static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *bytenr = key; + const struct btrfs_qgroup_swapped_block *block = rb_entry(node, + struct btrfs_qgroup_swapped_block, node); + + if (block->subvol_bytenr < *bytenr) + return -1; + else if (block->subvol_bytenr > *bytenr) + return 1; + + return 0; +} + +static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new, + struct btrfs_qgroup_swapped_block, node); + + return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing); +} + /* * Add subtree roots record into @subvol_root. * @@ -4692,8 +4702,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, struct btrfs_fs_info *fs_info = subvol_root->fs_info; struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; - struct rb_node **cur; - struct rb_node *parent = NULL; + struct rb_node *node; int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; @@ -4742,46 +4751,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, /* Insert @block into @blocks */ spin_lock(&blocks->lock); - cur = &blocks->blocks[level].rb_node; - while (*cur) { + node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp); + if (node) { struct btrfs_qgroup_swapped_block *entry; - parent = *cur; - entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, - node); + entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node); - if (entry->subvol_bytenr < block->subvol_bytenr) { - cur = &(*cur)->rb_left; - } else if (entry->subvol_bytenr > block->subvol_bytenr) { - cur = &(*cur)->rb_right; - } else { - if (entry->subvol_generation != - block->subvol_generation || - entry->reloc_bytenr != block->reloc_bytenr || - entry->reloc_generation != - block->reloc_generation) { - /* - * Duplicated but mismatch entry found. - * Shouldn't happen. - * - * Marking qgroup inconsistent should be enough - * for end users. - */ - DEBUG_WARN("duplicated but mismatched entry found"); - ret = -EEXIST; - } - kfree(block); - goto out_unlock; + if (entry->subvol_generation != block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != block->reloc_generation) { + /* + * Duplicated but mismatch entry found. Shouldn't happen. + * Marking qgroup inconsistent should be enough for end + * users. + */ + DEBUG_WARN("duplicated but mismatched entry found"); + ret = -EEXIST; } + kfree(block); + goto out_unlock; } - rb_link_node(&block->node, parent, cur); - rb_insert_color(&block->node, &blocks->blocks[level]); blocks->swapped = true; out_unlock: spin_unlock(&blocks->lock); out: if (ret < 0) - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret); return ret; } @@ -4801,7 +4796,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; struct rb_node *node; - bool found = false; bool swapped = false; int level = btrfs_header_level(subvol_eb); int ret = 0; @@ -4809,7 +4803,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) + if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); @@ -4817,23 +4811,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, spin_unlock(&blocks->lock); return 0; } - node = blocks->blocks[level].rb_node; - - while (node) { - block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); - if (block->subvol_bytenr < subvol_eb->start) { - node = node->rb_left; - } else if (block->subvol_bytenr > subvol_eb->start) { - node = node->rb_right; - } else { - found = true; - break; - } - } - if (!found) { + node = rb_find(&subvol_eb->start, &blocks->blocks[level], + qgroup_swapped_block_bytenr_key_cmp); + if (!node) { spin_unlock(&blocks->lock); goto out; } + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + /* Found one, remove it from @blocks first and update blocks->swapped */ rb_erase(&block->node, &blocks->blocks[level]); for (i = 0; i < BTRFS_MAX_LEVEL; i++) { @@ -4869,10 +4854,9 @@ free_out: free_extent_buffer(reloc_eb); out: if (ret < 0) { - btrfs_err_rl(fs_info, - "failed to account subtree at bytenr %llu: %d", - subvol_eb->start, ret); - qgroup_mark_inconsistent(fs_info); + qgroup_mark_inconsistent(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); } return ret; } @@ -4903,7 +4887,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) return 0; - if (!is_fstree(root)) + if (!btrfs_is_fstree(root)) return 0; /* If the extent predates enabling quotas, don't count it. */ diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 1834011ccc49..cab0b291088c 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -329,11 +329,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, item_size); - if (ret == -EEXIST) + if (ret == -EEXIST) { ret = update_raid_extent_item(trans, &stripe_key, stripe_extent, item_size); - if (ret) + if (ret) + btrfs_abort_transaction(trans, ret); + } else if (ret) { btrfs_abort_transaction(trans, ret); + } kfree(stripe_extent); diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h deleted file mode 100644 index 1c2d7cb1fe6f..000000000000 --- a/fs/btrfs/rcu-string.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - */ - -#ifndef BTRFS_RCU_STRING_H -#define BTRFS_RCU_STRING_H - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> -#include <linux/printk.h> - -struct rcu_string { - struct rcu_head rcu; - char str[]; -}; - -static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) -{ - size_t len = strlen(src) + 1; - struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + - (len * sizeof(char)), mask); - if (!ret) - return ret; - /* Warn if the source got unexpectedly truncated. */ - if (WARN_ON(strscpy(ret->str, src, len) < 0)) { - kfree(ret); - return NULL; - } - return ret; -} - -static inline void rcu_string_free(struct rcu_string *str) -{ - if (str) - kfree_rcu(str, rcu); -} - -#define printk_in_rcu(fmt, ...) do { \ - rcu_read_lock(); \ - printk(fmt, __VA_ARGS__); \ - rcu_read_unlock(); \ -} while (0) - -#define printk_ratelimited_in_rcu(fmt, ...) do { \ - rcu_read_lock(); \ - printk_ratelimited(fmt, __VA_ARGS__); \ - rcu_read_unlock(); \ -} while (0) - -#define rcu_str_deref(rcu_str) ({ \ - struct rcu_string *__str = rcu_dereference(rcu_str); \ - __str->str; \ -}) - -#endif diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 2928abf7eb82..3871c3a6c743 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -75,69 +75,70 @@ struct block_entry { struct list_head actions; }; +static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *bytenr = key; + const struct block_entry *entry = rb_entry(node, struct block_entry, node); + + if (entry->bytenr < *bytenr) + return 1; + else if (entry->bytenr > *bytenr) + return -1; + + return 0; +} + +static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct block_entry *new_entry = rb_entry(new, struct block_entry, node); + + return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing); +} + static struct block_entry *insert_block_entry(struct rb_root *root, struct block_entry *be) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct block_entry *entry; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct block_entry, node); - if (entry->bytenr > be->bytenr) - p = &(*p)->rb_left; - else if (entry->bytenr < be->bytenr) - p = &(*p)->rb_right; - else - return entry; - } + struct rb_node *node; - rb_link_node(&be->node, parent_node, p); - rb_insert_color(&be->node, root); - return NULL; + node = rb_find_add(&be->node, root, block_entry_bytenr_cmp); + return rb_entry_safe(node, struct block_entry, node); } static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) { - struct rb_node *n; - struct block_entry *entry = NULL; + struct rb_node *node; - n = root->rb_node; - while (n) { - entry = rb_entry(n, struct block_entry, node); - if (entry->bytenr < bytenr) - n = n->rb_right; - else if (entry->bytenr > bytenr) - n = n->rb_left; - else - return entry; - } - return NULL; + node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp); + return rb_entry_safe(node, struct block_entry, node); +} + +static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *objectid = key; + const struct root_entry *entry = rb_entry(node, struct root_entry, node); + + if (entry->root_objectid < *objectid) + return 1; + else if (entry->root_objectid > *objectid) + return -1; + + return 0; +} + +static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct root_entry *new_entry = rb_entry(new, struct root_entry, node); + + return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing); } static struct root_entry *insert_root_entry(struct rb_root *root, struct root_entry *re) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct root_entry *entry; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct root_entry, node); - if (entry->root_objectid > re->root_objectid) - p = &(*p)->rb_left; - else if (entry->root_objectid < re->root_objectid) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(&re->node, parent_node, p); - rb_insert_color(&re->node, root); - return NULL; + struct rb_node *node; + node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp); + return rb_entry_safe(node, struct root_entry, node); } static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) @@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) return 0; } +static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing) +{ + struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node); + struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node); + + return comp_refs(new_entry, existing_entry); +} + static struct ref_entry *insert_ref_entry(struct rb_root *root, struct ref_entry *ref) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct ref_entry *entry; - int cmp; - - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct ref_entry, node); - cmp = comp_refs(entry, ref); - if (cmp > 0) - p = &(*p)->rb_left; - else if (cmp < 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(&ref->node, parent_node, p); - rb_insert_color(&ref->node, root); - return NULL; + struct rb_node *node; + node = rb_find_add(&ref->node, root, ref_entry_cmp); + return rb_entry_safe(node, struct ref_entry, node); } static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) { - struct rb_node *n; - struct root_entry *entry = NULL; + struct rb_node *node; - n = root->rb_node; - while (n) { - entry = rb_entry(n, struct root_entry, node); - if (entry->root_objectid < objectid) - n = n->rb_right; - else if (entry->root_objectid > objectid) - n = n->rb_left; - else - return entry; - } - return NULL; + node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp); + return rb_entry_safe(node, struct root_entry, node); } #ifdef CONFIG_STACKTRACE @@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, * our sanity checks pass as they are no longer needed. */ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref) + const struct btrfs_ref *generic_ref) { struct ref_entry *ref = NULL, *exist; struct ref_action *ra = NULL; diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 3511e1a5c96b..559bd25a2b7a 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -19,7 +19,7 @@ struct btrfs_ref; int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref); + const struct btrfs_ref *generic_ref); void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) } static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, - struct btrfs_ref *generic_ref) + const struct btrfs_ref *generic_ref) { return 0; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 62161beca559..ce25ab7f0e99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -46,11 +46,9 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); - goto out; + return ret; } - ret = btrfs_end_transaction(trans); -out: - return ret; + return btrfs_end_transaction(trans); } static int copy_inline_to_page(struct btrfs_inode *inode, @@ -95,8 +93,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; - btrfs_clear_extent_bits(&inode->io_tree, file_offset, range_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -270,11 +268,15 @@ copy_inline_extent: drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } ret = btrfs_insert_empty_item(trans, root, path, new_key, size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } write_extent_buffer(path->nodes[0], inline_data, btrfs_item_ptr_offset(path->nodes[0], @@ -283,6 +285,8 @@ copy_inline_extent: btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); btrfs_set_inode_full_sync(inode); ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (!ret && !trans) { /* @@ -297,10 +301,8 @@ out: trans = NULL; } } - if (ret && trans) { - btrfs_abort_transaction(trans, ret); + if (ret && trans) btrfs_end_transaction(trans); - } if (!ret) *trans_out = trans; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 02086191630d..e58151933844 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -90,10 +90,15 @@ * map address of tree root to tree */ struct mapping_node { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simle_node for search/insert */ + union { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; void *data; }; @@ -106,10 +111,15 @@ struct mapping_tree { * present a tree block to process */ struct tree_block { - struct { - struct rb_node rb_node; - u64 bytenr; - }; /* Use rb_simple_node for search/insert */ + union { + /* Use rb_simple_node for search/insert */ + struct { + struct rb_node rb_node; + u64 bytenr; + }; + + struct rb_simple_node simple_node; + }; u64 owner; struct btrfs_key key; u8 level; @@ -480,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root) node->data = root; spin_lock(&rc->reloc_root_tree.lock); - rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { btrfs_err(fs_info, @@ -564,8 +573,7 @@ static int __update_reloc_root(struct btrfs_root *root) spin_lock(&rc->reloc_root_tree.lock); node->bytenr = root->node->start; - rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); @@ -1516,7 +1524,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_root_level(root_item); - atomic_inc(&reloc_root->node->refs); + refcount_inc(&reloc_root->node->refs); path->nodes[level] = reloc_root->node; path->slots[level] = 0; } else { @@ -2617,7 +2625,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, * tree. */ if (block->owner && - (!is_fstree(block->owner) || + (!btrfs_is_fstree(block->owner) || block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { ret = relocate_cowonly_block(trans, rc, block, path); if (ret) @@ -2658,66 +2666,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control u64 num_bytes; int nr; int ret = 0; - u64 i_size = i_size_read(&inode->vfs_inode); u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset = prealloc_start; /* - * For subpage case, previous i_size may not be aligned to PAGE_SIZE. - * This means the range [i_size, PAGE_END + 1) is filled with zeros by - * btrfs_do_readpage() call of previously relocated file cluster. + * For blocksize < folio size case (either bs < page size or large folios), + * beyond i_size, all blocks are filled with zero. * - * If the current cluster starts in the above range, btrfs_do_readpage() + * If the current cluster covers the above range, btrfs_do_readpage() * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * - * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + * Here we have to invalidate the cache covering our cluster. */ - if (!PAGE_ALIGNED(i_size)) { - struct address_space *mapping = inode->vfs_inode.i_mapping; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; - struct folio *folio; - - ASSERT(sectorsize < PAGE_SIZE); - ASSERT(IS_ALIGNED(i_size, sectorsize)); - - /* - * Subpage can't handle page with DIRTY but without UPTODATE - * bit as it can lead to the following deadlock: - * - * btrfs_read_folio() - * | Page already *locked* - * |- btrfs_lock_and_flush_ordered_range() - * |- btrfs_start_ordered_extent() - * |- extent_write_cache_pages() - * |- lock_page() - * We try to lock the page we already hold. - * - * Here we just writeback the whole data reloc inode, so that - * we will be ensured to have no dirty range in the page, and - * are safe to clear the uptodate bits. - * - * This shouldn't cause too much overhead, as we need to write - * the data back anyway. - */ - ret = filemap_write_and_wait(mapping); - if (ret < 0) - return ret; - - folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); - /* - * If page is freed we don't need to do anything then, as we - * will re-read the whole page anyway. - */ - if (!IS_ERR(folio)) { - btrfs_subpage_clear_uptodate(fs_info, folio, i_size, - round_up(i_size, PAGE_SIZE) - i_size); - folio_unlock(folio); - folio_put(folio); - } - } + ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, + prealloc_end); + if (ret < 0) + return ret; BUG_ON(cluster->start != cluster->boundary[0]); ret = btrfs_alloc_data_chunk_ondemand(inode, @@ -2806,13 +2772,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, static int relocate_one_folio(struct reloc_control *rc, struct file_ra_state *ra, - int *cluster_nr, unsigned long index) + int *cluster_nr, u64 *file_offset_ret) { const struct file_extent_cluster *cluster = &rc->cluster; struct inode *inode = rc->data_inode; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + const u64 orig_file_offset = *file_offset_ret; u64 offset = BTRFS_I(inode)->reloc_block_group_start; - const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; + const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT; + const pgoff_t index = orig_file_offset >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); struct folio *folio; u64 folio_start; @@ -2845,8 +2813,6 @@ again: return PTR_ERR(folio); } - WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); @@ -2875,7 +2841,7 @@ again: goto release_folio; folio_start = folio_pos(folio); - folio_end = folio_start + PAGE_SIZE - 1; + folio_end = folio_start + folio_size(folio) - 1; /* * Start from the cluster, as for subpage case, the cluster can start @@ -2923,7 +2889,8 @@ again: * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, + folio_start, folio_size(folio))) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -2953,6 +2920,7 @@ again: btrfs_throttle(fs_info); if (btrfs_should_cancel_balance(fs_info)) ret = -ECANCELED; + *file_offset_ret = folio_end + 1; return ret; release_folio: @@ -2966,8 +2934,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) struct inode *inode = rc->data_inode; const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; - unsigned long index; - unsigned long last_index; + u64 cur_file_offset = cluster->start - offset; struct file_ra_state *ra; int cluster_nr = 0; int ret = 0; @@ -2989,10 +2956,11 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) if (ret) goto out; - last_index = (cluster->end - offset) >> PAGE_SHIFT; - for (index = (cluster->start - offset) >> PAGE_SHIFT; - index <= last_index && !ret; index++) - ret = relocate_one_folio(rc, ra, &cluster_nr, index); + while (cur_file_offset < cluster->end - offset) { + ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); + if (ret) + break; + } if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3155,7 +3123,7 @@ static int add_tree_block(struct reloc_control *rc, block->key_ready = false; block->owner = owner; - rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); + rb_node = rb_simple_insert(blocks, &block->simple_node); if (rb_node) btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, -EEXIST); @@ -3643,7 +3611,7 @@ restart: } btrfs_release_path(path); - btrfs_clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); if (trans) { btrfs_end_transaction_throttle(trans); @@ -3880,7 +3848,7 @@ static void free_reloc_control(struct reloc_control *rc) */ static void describe_relocation(struct btrfs_block_group *block_group) { - char buf[128] = {'\0'}; + char buf[128] = "NONE"; btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); @@ -3900,7 +3868,8 @@ static const char *stage_to_string(enum reloc_stage stage) /* * function to relocate all extents in a block group. */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, + bool verbose) { struct btrfs_block_group *bg; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); @@ -3992,7 +3961,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - describe_relocation(rc->block_group); + if (verbose) + describe_relocation(rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); @@ -4036,8 +4006,10 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(fs_info, "found %llu extents, stage: %s", - rc->extents_found, stage_to_string(finishes_stage)); + if (verbose) + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, + stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); @@ -4339,7 +4311,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, } btrfs_backref_drop_node_buffer(node); - atomic_inc(&cow->refs); + refcount_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 788c86d8633a..5c36b3f84b57 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -12,7 +12,8 @@ struct btrfs_trans_handle; struct btrfs_ordered_extent; struct btrfs_pending_snapshot; -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, + bool verbose); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ce36fafc771e..6776e6ab8d10 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -556,8 +556,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", + btrfs_warn(fs_info, +"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -570,8 +570,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return 0; err: - btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + btrfs_warn(fs_info, + "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * &ref_level); if (ret < 0) { btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + "scrub: failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); break; } if (ret > 0) break; - btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", + btrfs_warn(fs_info, +"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); @@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; @@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad fsid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; @@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; @@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); @@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad generation, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); @@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, - "tree block at %llu crosses stripe boundary %llu", + "scrub: tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); @@ -1045,13 +1045,13 @@ skip: */ if (repaired) { if (dev) { - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s physical %llu", + btrfs_err_rl(fs_info, + "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", + btrfs_err_rl(fs_info, + "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; @@ -1059,13 +1059,13 @@ skip: /* The remaining are all for unrepaired. */ if (dev) { - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", + btrfs_err_rl(fs_info, +"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", + btrfs_err_rl(fs_info, + "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, physical, sctx->write_pointer); if (ret) - btrfs_err(fs_info, - "zoned: failed to recover write pointer"); + btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); @@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, int ret; if (unlikely(!extent_root || !csum_root)) { - btrfs_err(fs_info, "no valid extent or csum root for scrub"); + btrfs_err(fs_info, "scrub: no valid extent or csum root found"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1807,7 +1806,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_io_context *bioc = NULL; const u64 logical = stripe->logical + (i << fs_info->sectorsize_bits); - int err; + int ret; io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; @@ -1815,11 +1814,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * For RST cases, we need to manually split the bbio to * follow the RST boundary. */ - err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &stripe_len, &bioc, &io_stripe, &mirror); btrfs_put_bioc(bioc); - if (err < 0) { - if (err != -ENODATA) { + if (ret < 0) { + if (ret != -ENODATA) { /* * Earlier btrfs_get_raid_extent_offset() * returned -ENODATA, which means there's @@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, - "stripe %llu has unrepaired metadata sector at %llu", + "scrub: stripe %llu has unrepaired metadata sector at logical %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; @@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, -"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", +"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; @@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", + "scrub: skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); + btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", + ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); @@ -2892,13 +2891,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, - "super block at physical %llu devid %llu has bad csum", + "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, -"super block at physical %llu devid %llu has bad generation %llu expect %llu", +"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; @@ -3058,8 +3057,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_err_in_rcu(fs_info, - "scrub on devid %llu: filesystem on %s is not writable", + btrfs_err(fs_info, + "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2891ec4056c6..7664025a5af4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4,6 +4,7 @@ */ #include <linux/bsearch.h> +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/sort.h> @@ -758,7 +759,7 @@ static int send_header(struct send_ctx *sctx) { struct btrfs_stream_header hdr; - strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); + strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); @@ -1804,7 +1805,7 @@ static int gen_unique_name(struct send_ctx *sctx, ino, gen, idx); ASSERT(len < sizeof(tmp)); tmp_name.name = tmp; - tmp_name.len = strlen(tmp); + tmp_name.len = len; di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1843,7 +1844,7 @@ static int gen_unique_name(struct send_ctx *sctx, break; } - ret = fs_path_add(dest, tmp, strlen(tmp)); + ret = fs_path_add(dest, tmp, len); out: btrfs_free_path(path); @@ -4628,7 +4629,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node) { const struct recorded_ref *data = k; const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); - int result; if (data->dir > ref->dir) return 1; @@ -4642,12 +4642,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node) return 1; if (data->name_len < ref->name_len) return -1; - result = strcmp(data->name, ref->name); - if (result > 0) - return 1; - if (result < 0) - return -1; - return 0; + return strcmp(data->name, ref->name); } static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) @@ -5411,6 +5406,30 @@ tlv_put_failure: return ret; } +static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len) +{ + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); + + ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE); + if (ret < 0) + return ret; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); + TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: + return ret; +} + static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; @@ -5419,6 +5438,14 @@ static int send_hole(struct send_ctx *sctx, u64 end) int ret = 0; /* + * Starting with send stream v2 we have fallocate and can use it to + * punch holes instead of sending writes full of zeroes. + */ + if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE)) + return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - offset); + + /* * A hole that starts at EOF or beyond it. Since we do not yet support * fallocate (for extent preallocation and hole punching), sending a * write of zeroes starting at EOF or beyond would later require issuing diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d9087aa81b21..0481c693ac2e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -615,7 +615,7 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) + bool dump_block_groups) { struct btrfs_block_group *cache; u64 total_avail = 0; @@ -1887,7 +1887,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, false); } return ret; } @@ -1918,7 +1918,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, false); } return ret; } @@ -1973,13 +1973,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) static u64 calc_pct_ratio(u64 x, u64 y) { - int err; + int ret; if (!y) return 0; again: - err = check_mul_overflow(100, x, &x); - if (err) + ret = check_mul_overflow(100, x, &x); + if (ret) goto lose_precision; return div64_u64(x, y); lose_precision: @@ -2139,7 +2139,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool } } -bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) +static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) { bool ret; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 92b7f5e2b850..679f22efb407 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -278,7 +278,7 @@ u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); + bool dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, @@ -306,7 +306,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); -bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index d4f019233493..c9b3821957f7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -49,7 +49,7 @@ * Implementation: * * - Common - * Both metadata and data will use a new structure, btrfs_subpage, to + * Both metadata and data will use a new structure, btrfs_folio_state, to * record the status of each sector inside a page. This provides the extra * granularity needed. * @@ -63,10 +63,10 @@ * This means a slightly higher tree locking latency. */ -int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio, enum btrfs_subpage_type type) +int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, + struct folio *folio, enum btrfs_folio_type type) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; /* For metadata we don't support large folio yet. */ if (type == BTRFS_SUBPAGE_METADATA) @@ -87,18 +87,18 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); - if (IS_ERR(subpage)) - return PTR_ERR(subpage); + bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type); + if (IS_ERR(bfs)) + return PTR_ERR(bfs); - folio_attach_private(folio, subpage); + folio_attach_private(folio, bfs); return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, - enum btrfs_subpage_type type) +void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_folio_type type) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; /* Either not subpage, or the folio already has private attached. */ if (!folio_test_private(folio)) @@ -108,15 +108,15 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; - subpage = folio_detach_private(folio); - ASSERT(subpage); - btrfs_free_subpage(subpage); + bfs = folio_detach_private(folio); + ASSERT(bfs); + btrfs_free_folio_state(bfs); } -struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - size_t fsize, enum btrfs_subpage_type type) +struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, + size_t fsize, enum btrfs_folio_type type) { - struct btrfs_subpage *ret; + struct btrfs_folio_state *ret; unsigned int real_size; ASSERT(fs_info->sectorsize < fsize); @@ -136,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, return ret; } -void btrfs_free_subpage(struct btrfs_subpage *subpage) -{ - kfree(subpage); -} - /* * Increase the eb_refs of current subpage. * @@ -152,7 +147,7 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) */ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; if (!btrfs_meta_is_subpage(fs_info)) return; @@ -160,13 +155,13 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = folio_get_private(folio); - atomic_inc(&subpage->eb_refs); + bfs = folio_get_private(folio); + atomic_inc(&bfs->eb_refs); } void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; if (!btrfs_meta_is_subpage(fs_info)) return; @@ -174,9 +169,9 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * ASSERT(folio_test_private(folio) && folio->mapping); lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = folio_get_private(folio); - ASSERT(atomic_read(&subpage->eb_refs)); - atomic_dec(&subpage->eb_refs); + bfs = folio_get_private(folio); + ASSERT(atomic_read(&bfs->eb_refs)); + atomic_dec(&bfs->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, @@ -191,8 +186,9 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, * unmapped page like dummy extent buffer pages. */ if (folio->mapping) - ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio), + "start=%llu len=%u folio_pos=%llu folio_size=%zu", + start, len, folio_pos(folio), folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ @@ -221,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + folio_size(folio), - orig_start + orig_len) - *start; + *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; @@ -238,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, btrfs_subpage_assert(fs_info, folio, start, len); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); /* * We have call sites passing @lock_page into * extent_clear_unlock_delalloc() for compression path. @@ -246,18 +241,18 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, * This @locked_page is locked by plain lock_page(), thus its * subpage::locked is 0. Handle them in a special way. */ - if (atomic_read(&subpage->nr_locked) == 0) { - spin_unlock_irqrestore(&subpage->lock, flags); + if (atomic_read(&bfs->nr_locked) == 0) { + spin_unlock_irqrestore(&bfs->lock, flags); return true; } - for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { - clear_bit(bit, subpage->bitmaps); + for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) { + clear_bit(bit, bfs->bitmaps); cleared++; } - ASSERT(atomic_read(&subpage->nr_locked) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->nr_locked); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &bfs->nr_locked); + spin_unlock_irqrestore(&bfs->lock, flags); return last; } @@ -280,7 +275,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); ASSERT(folio_test_locked(folio)); @@ -296,7 +291,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, * Since we own the page lock, no one else could touch subpage::locked * and we are safe to do several atomic operations without spinlock. */ - if (atomic_read(&subpage->nr_locked) == 0) { + if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; @@ -310,7 +305,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; @@ -323,42 +318,42 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, return; } - if (atomic_read(&subpage->nr_locked) == 0) { + if (atomic_read(&bfs->nr_locked) == 0) { /* No subpage lock, locked by plain lock_page(). */ folio_unlock(folio); return; } - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); for_each_set_bit(bit, &bitmap, blocks_per_folio) { - if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } - ASSERT(atomic_read(&subpage->nr_locked) >= cleared); - last = atomic_sub_and_test(cleared, &subpage->nr_locked); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + last = atomic_sub_and_test(cleared, &bfs->nr_locked); + spin_unlock_irqrestore(&bfs->lock, flags); if (last) folio_unlock(folio); } #define subpage_test_bitmap_all_set(fs_info, folio, name) \ ({ \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ const unsigned int blocks_per_folio = \ btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_set(subpage->bitmaps, \ + bitmap_test_range_all_set(bfs->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ }) #define subpage_test_bitmap_all_zero(fs_info, folio, name) \ ({ \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ const unsigned int blocks_per_folio = \ btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_zero(subpage->bitmaps, \ + bitmap_test_range_all_zero(bfs->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ }) @@ -366,43 +361,43 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_uptodate(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&bfs->lock, flags); folio_mark_dirty(folio); } @@ -419,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); return last; } @@ -446,91 +441,91 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (!folio_test_writeback(folio)) folio_start_writeback(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_set_ordered(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct btrfs_folio_state *bfs = folio_get_private(folio); unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + spin_lock_irqsave(&bfs->lock, flags); + bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); folio_clear_checked(folio); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -541,16 +536,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = folio_get_private(folio); \ + struct btrfs_folio_state *bfs = folio_get_private(folio); \ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ \ - spin_lock_irqsave(&subpage->lock, flags); \ - ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + spin_lock_irqsave(&bfs->lock, flags); \ + ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \ len >> fs_info->sectorsize_bits); \ - spin_unlock_irqrestore(&subpage->lock, flags); \ + spin_unlock_irqrestore(&bfs->lock, flags); \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); @@ -662,10 +657,10 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, { \ const unsigned int blocks_per_folio = \ btrfs_blocks_per_folio(fs_info, folio); \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ + const struct btrfs_folio_state *bfs = folio_get_private(folio); \ \ ASSERT(blocks_per_folio <= BITS_PER_LONG); \ - *dst = bitmap_read(subpage->bitmaps, \ + *dst = bitmap_read(bfs->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ } @@ -690,7 +685,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned int start_bit; unsigned int nbits; unsigned long flags; @@ -705,15 +700,15 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); nbits = len >> fs_info->sectorsize_bits; - subpage = folio_get_private(folio); - ASSERT(subpage); - spin_lock_irqsave(&subpage->lock, flags); - if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + bfs = folio_get_private(folio); + ASSERT(bfs); + spin_lock_irqsave(&bfs->lock, flags); + if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); } - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); - spin_unlock_irqrestore(&subpage->lock, flags); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -726,7 +721,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned long flags; unsigned int start_bit; unsigned int nbits; @@ -736,19 +731,19 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); nbits = len >> fs_info->sectorsize_bits; - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); /* Target range should not yet be locked. */ - if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) { + if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) { SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len); - ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); + ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits)); } - bitmap_set(subpage->bitmaps, start_bit, nbits); - ret = atomic_add_return(nbits, &subpage->nr_locked); + bitmap_set(bfs->bitmaps, start_bit, nbits); + ret = atomic_add_return(nbits, &bfs->nr_locked); ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } /* @@ -776,7 +771,7 @@ bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct ext void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; @@ -788,18 +783,18 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(blocks_per_folio > 1); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); - dump_page(folio_page(folio, 0), "btrfs subpage dump"); + dump_page(folio_page(folio, 0), "btrfs folio state dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), @@ -815,14 +810,14 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap) { - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); - spin_lock_irqsave(&subpage->lock, flags); + spin_lock_irqsave(&bfs->lock, flags); GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); - spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock_irqrestore(&bfs->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 3042c5ea840a..ee0710eb13fd 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -32,9 +32,31 @@ struct folio; enum { btrfs_bitmap_nr_uptodate = 0, btrfs_bitmap_nr_dirty, + + /* + * This can be changed to atomic eventually. But this change will rely + * on the async delalloc range rework for locked bitmap. As async + * delalloc can unlock its range and mark blocks writeback at random + * timing. + */ btrfs_bitmap_nr_writeback, + + /* + * The ordered and checked flags are for COW fixup, already marked + * deprecated, and will be removed eventually. + */ btrfs_bitmap_nr_ordered, btrfs_bitmap_nr_checked, + + /* + * The locked bit is for async delalloc range (compression), currently + * async extent is queued with the range locked, until the compression + * is done. + * So an async extent can unlock the range at any random timing. + * + * This will need a rework on the async extent lifespan (mark writeback + * and do compression) before deprecating this flag. + */ btrfs_bitmap_nr_locked, btrfs_bitmap_nr_max }; @@ -43,7 +65,7 @@ enum { * Structure to trace status of each sector inside a page, attached to * page::private for both data and metadata inodes. */ -struct btrfs_subpage { +struct btrfs_folio_state { /* Common members for both data and metadata pages */ spinlock_t lock; union { @@ -51,7 +73,7 @@ struct btrfs_subpage { * Structures only used by metadata * * @eb_refs should only be operated under private_lock, as it - * manages whether the subpage can be detached. + * manages whether the btrfs_folio_state can be detached. */ atomic_t eb_refs; @@ -65,12 +87,11 @@ struct btrfs_subpage { unsigned long bitmaps[]; }; -enum btrfs_subpage_type { +enum btrfs_folio_type { BTRFS_SUBPAGE_METADATA, BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE /* * Subpage support for metadata is more complex, as we can have dummy extent * buffers, where folios have no mapping to determine the owning inode. @@ -91,29 +112,19 @@ static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return fs_info->sectorsize < folio_size(folio); } -#else -static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) -{ - return false; -} -static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio) -{ - if (folio->mapping && folio->mapping->host) - ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); - return false; -} -#endif -int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, - enum btrfs_subpage_type type); +int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info, + struct folio *folio, enum btrfs_folio_type type); +void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_folio_type type); /* Allocate additional data where page represents more than one sector */ -struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - size_t fsize, enum btrfs_subpage_type type); -void btrfs_free_subpage(struct btrfs_subpage *subpage); +struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info, + size_t fsize, enum btrfs_folio_type type); +static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs) +{ + kfree(bfs); +} void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a0c65adce1ab..68e35a3700ff 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -261,10 +261,65 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { {} }; -/* No support for restricting writes to btrfs devices yet... */ -static inline blk_mode_t btrfs_open_mode(struct fs_context *fc) +static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level) { - return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES; + const int len = strlen(type); + + return (strncmp(string, type, len) == 0) && + ((may_have_level && string[len] == ':') || string[len] == '\0'); +} + +static int btrfs_parse_compress(struct btrfs_fs_context *ctx, + const struct fs_parameter *param, int opt) +{ + const char *string = param->string; + + /* + * Provide the same semantics as older kernels that don't use fs + * context, specifying the "compress" option clears "force-compress" + * without the need to pass "compress-force=[no|none]" before + * specifying "compress". + */ + if (opt != Opt_compress_force && opt != Opt_compress_force_type) + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zlib", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "lzo", false)) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "zstd", true)) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (btrfs_match_compress_type(string, "no", false) || + btrfs_match_compress_type(string, "none", false)) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", string); + return -EINVAL; + } + return 0; } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) @@ -303,10 +358,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_device: { struct btrfs_device *device; - blk_mode_t mode = btrfs_open_mode(fc); mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(param->string, mode, false); + device = btrfs_scan_one_device(param->string, false); mutex_unlock(&uuid_mutex); if (IS_ERR(device)) return PTR_ERR(device); @@ -336,53 +390,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) fallthrough; case Opt_compress: case Opt_compress_type: - /* - * Provide the same semantics as older kernels that don't use fs - * context, specifying the "compress" option clears - * "force-compress" without the need to pass - * "compress-force=[no|none]" before specifying "compress". - */ - if (opt != Opt_compress_force && opt != Opt_compress_force_type) - btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); - - if (opt == Opt_compress || opt == Opt_compress_force) { - ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "zlib", 4) == 0) { - ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = - btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - param->string + 4); - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "lzo", 3) == 0) { - ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "zstd", 4) == 0) { - ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = - btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - param->string + 4); - btrfs_set_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, NODATACOW); - btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (strncmp(param->string, "no", 2) == 0) { - ctx->compress_level = 0; - ctx->compress_type = 0; - btrfs_clear_opt(ctx->mount_opt, COMPRESS); - btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); - } else { - btrfs_err(NULL, "unrecognized compression value %s", - param->string); + if (btrfs_parse_compress(ctx, param, opt)) return -EINVAL; - } break; case Opt_ssd: if (result.negated) { @@ -945,12 +954,12 @@ static int btrfs_fill_super(struct super_block *sb, { struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - int err; + int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; - sb->s_d_op = &btrfs_dentry_operations; + set_default_d_op(sb, &btrfs_dentry_operations); sb->s_export_op = &btrfs_export_ops; #ifdef CONFIG_FS_VERITY sb->s_vop = &btrfs_verityops; @@ -959,28 +968,28 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_time_gran = 1; sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; - err = super_setup_bdi(sb); - if (err) { + ret = super_setup_bdi(sb); + if (ret) { btrfs_err(fs_info, "super_setup_bdi failed"); - return err; + return ret; } - err = open_ctree(sb, fs_devices); - if (err) { - btrfs_err(fs_info, "open_ctree failed: %d", err); - return err; + ret = open_ctree(sb, fs_devices); + if (ret) { + btrfs_err(fs_info, "open_ctree failed: %d", ret); + return ret; } inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { - err = PTR_ERR(inode); - btrfs_handle_fs_error(fs_info, err, NULL); + ret = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, ret, NULL); goto fail_close; } sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail_close; } @@ -989,7 +998,7 @@ static int btrfs_fill_super(struct super_block *sb, fail_close: close_ctree(fs_info); - return err; + return ret; } int btrfs_sync_fs(struct super_block *sb, int wait) @@ -1826,10 +1835,9 @@ static int btrfs_get_tree_super(struct fs_context *fc) struct btrfs_fs_info *fs_info = fc->s_fs_info; struct btrfs_fs_context *ctx = fc->fs_private; struct btrfs_fs_devices *fs_devices = NULL; - struct block_device *bdev; struct btrfs_device *device; struct super_block *sb; - blk_mode_t mode = btrfs_open_mode(fc); + blk_mode_t mode = sb_open_mode(fc->sb_flags); int ret; btrfs_ctx_to_info(fs_info, ctx); @@ -1839,47 +1847,60 @@ static int btrfs_get_tree_super(struct fs_context *fc) * With 'true' passed to btrfs_scan_one_device() (mount time) we expect * either a valid device or an error. */ - device = btrfs_scan_one_device(fc->source, mode, true); + device = btrfs_scan_one_device(fc->source, true); ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); return PTR_ERR(device); } - fs_devices = device->fs_devices; + /* + * We cannot hold uuid_mutex calling sget_fc(), it will lead to a + * locking order reversal with s_umount. + * + * So here we increase the holding number of fs_devices, this will ensure + * the fs_devices itself won't be freed. + */ + btrfs_fs_devices_inc_holding(fs_devices); fs_info->fs_devices = fs_devices; - - ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); mutex_unlock(&uuid_mutex); - if (ret) - return ret; - - if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - ret = -EACCES; - goto error; - } - bdev = fs_devices->latest_dev->bdev; - /* - * From now on the error handling is not straightforward. - * - * If successful, this will transfer the fs_info into the super block, - * and fc->s_fs_info will be NULL. However if there's an existing - * super, we'll still have fc->s_fs_info populated. If we error - * completely out it'll be cleaned up when we drop the fs_context, - * otherwise it's tied to the lifetime of the super_block. - */ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - goto error; + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + /* + * Since the fs_devices is not opened, it can be freed at any + * time after unlocking uuid_mutex. We need to avoid double + * free through put_fs_context()->btrfs_free_fs_info(). + * So here we reset fs_info->fs_devices to NULL, and let the + * regular fs_devices reclaim path to handle it. + * + * This applies to all later branches where no fs_devices is + * opened. + */ + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + return PTR_ERR(sb); } set_device_specific_options(fs_info); if (sb->s_root) { - btrfs_close_devices(fs_devices); + /* + * Not the first mount of the fs thus got an existing super block. + * Will reuse the returned super block, fs_info and fs_devices. + * + * fc->s_fs_info is not touched and will be later freed by + * put_fs_context() through btrfs_free_fs_context(). + */ + ASSERT(fc->s_fs_info == fs_info); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); /* * At this stage we may have RO flag mismatch between * fc->sb_flags and sb->s_flags. Caller should detect such @@ -1887,9 +1908,32 @@ static int btrfs_get_tree_super(struct fs_context *fc) * needed. */ } else { + struct block_device *bdev; + + /* + * The first mount of the fs thus a new superblock, fc->s_fs_info + * must be NULL, and the ownership of our fs_info and fs_devices is + * transferred to the super block. + */ + ASSERT(fc->s_fs_info == NULL); + + mutex_lock(&uuid_mutex); + btrfs_fs_devices_dec_holding(fs_devices); + ret = btrfs_open_devices(fs_devices, mode, sb); + if (ret < 0) + fs_info->fs_devices = NULL; + mutex_unlock(&uuid_mutex); + if (ret < 0) { + deactivate_locked_super(sb); + return ret; + } + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + deactivate_locked_super(sb); + return -EACCES; + } + bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); - btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; ret = btrfs_fill_super(sb, fs_devices); if (ret) { deactivate_locked_super(sb); @@ -1901,10 +1945,6 @@ static int btrfs_get_tree_super(struct fs_context *fc) fc->root = dget(sb->s_root); return 0; - -error: - btrfs_close_devices(fs_devices); - return ret; } /* @@ -1980,17 +2020,13 @@ error: * btrfs or not, setting the whole super block RO. To make per-subvolume mounting * work with different options work we need to keep backward compatibility. */ -static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt) +static int btrfs_reconfigure_for_mount(struct fs_context *fc) { int ret = 0; - if (fc->sb_flags & SB_RDONLY) - return ret; - - down_write(&mnt->mnt_sb->s_umount); - if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY)) + if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY)) ret = btrfs_reconfigure(fc); - up_write(&mnt->mnt_sb->s_umount); + return ret; } @@ -2035,25 +2071,18 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) */ dup_fc->s_fs_info = fs_info; - /* - * We'll do the security settings in our btrfs_get_tree_super() mount - * loop, they were duplicated into dup_fc, we can drop the originals - * here. - */ - security_free_mnt_opts(&fc->security); - fc->security = NULL; + ret = btrfs_get_tree_super(dup_fc); + if (ret) + goto error; - mnt = fc_mount(dup_fc); - if (IS_ERR(mnt)) { - put_fs_context(dup_fc); - return PTR_ERR(mnt); - } - ret = btrfs_reconfigure_for_mount(dup_fc, mnt); + ret = btrfs_reconfigure_for_mount(dup_fc); + up_write(&dup_fc->root->d_sb->s_umount); + if (ret) + goto error; + mnt = vfs_create_mount(dup_fc); put_fs_context(dup_fc); - if (ret) { - mntput(mnt); - return ret; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); /* * This free's ->subvol_name, because if it isn't set we have to @@ -2067,25 +2096,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc) fc->root = dentry; return 0; +error: + put_fs_context(dup_fc); + return ret; } static int btrfs_get_tree(struct fs_context *fc) { - /* - * Since we use mount_subtree to mount the default/specified subvol, we - * have to do mounts in two steps. - * - * First pass through we call btrfs_get_tree_subvol(), this is just a - * wrapper around fc_mount() to call back into here again, and this time - * we'll call btrfs_get_tree_super(). This will do the open_ctree() and - * everything to open the devices and file system. Then we return back - * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and - * from there we can do our mount_subvol() call, which will lookup - * whichever subvol we're mounting and setup this fc with the - * appropriate dentry for the subvol. - */ - if (fc->s_fs_info) - return btrfs_get_tree_super(fc); + ASSERT(fc->s_fs_info == NULL); + return btrfs_get_tree_subvol(fc); } @@ -2217,7 +2236,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, * Scanning outside of mount can return NULL which would turn * into 0 error code. */ - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + device = btrfs_scan_one_device(vol->name, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2235,7 +2254,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, * Scanning outside of mount can return NULL which would turn * into 0 error code. */ - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + device = btrfs_scan_one_device(vol->name, false); if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); if (IS_ERR(device)) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5d93d9dd2c12..9d398f7a36ad 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -160,8 +160,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa) clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR; break; default: - pr_warn("btrfs: sysfs: unknown feature set %d\n", - fa->feature_set); + btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set); return 0; } @@ -1138,13 +1137,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); + u64 now = ktime_get_ns(); + u64 start_time = fs_info->commit_stats.critical_section_start_time; + u64 pending = 0; + + if (start_time) + pending = now - start_time; return sysfs_emit(buf, "commits %llu\n" + "cur_commit_ms %llu\n" "last_commit_ms %llu\n" "max_commit_ms %llu\n" "total_commit_ms %llu\n", fs_info->commit_stats.commit_count, + div_u64(pending, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); @@ -1202,7 +1209,7 @@ static ssize_t quota_override_store(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); unsigned long knob; - int err; + int ret; if (!fs_info) return -EPERM; @@ -1210,9 +1217,9 @@ static ssize_t quota_override_store(struct kobject *kobj, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - err = kstrtoul(buf, 10, &knob); - if (err) - return err; + ret = kstrtoul(buf, 10, &knob); + if (ret) + return ret; if (knob > 1) return -EINVAL; @@ -2239,7 +2246,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", + btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed", action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), &disk_to_dev(bdev->bd_disk)->kobj); } @@ -2282,15 +2289,15 @@ static struct kset *btrfs_kset; */ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { - int error; + int ret; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, - "%pU", fs_devs->fsid); - if (error) { + ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); + if (ret) { kobject_put(&fs_devs->fsid_kobj); - return error; + return ret; } fs_devs->devices_kobj = kobject_create_and_add("devices", @@ -2316,71 +2323,70 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { - int error; + int ret; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - error = btrfs_sysfs_add_fs_devices(fs_devs); - if (error) - return error; + ret = btrfs_sysfs_add_fs_devices(fs_devs); + if (ret) + return ret; - error = sysfs_create_files(fsid_kobj, btrfs_attrs); - if (error) { + ret = sysfs_create_files(fsid_kobj, btrfs_attrs); + if (ret) { btrfs_sysfs_remove_fs_devices(fs_devs); - return error; + return ret; } - error = sysfs_create_group(fsid_kobj, - &btrfs_feature_attr_group); - if (error) + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); + if (ret) goto failure; #ifdef CONFIG_BTRFS_DEBUG fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); if (!fs_info->debug_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); - if (error) + ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + if (ret) goto failure; #endif /* Discard directory */ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj); if (!fs_info->discard_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->discard_kobj, discard_attrs); - if (error) + ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs); + if (ret) goto failure; - error = addrm_unknown_feature_attrs(fs_info, true); - if (error) + ret = addrm_unknown_feature_attrs(fs_info, true); + if (ret) goto failure; - error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); - if (error) + ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (ret) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { - error = -ENOMEM; + ret = -ENOMEM; goto failure; } - error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); - if (error) + ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs); + if (ret) goto failure; return 0; failure: btrfs_sysfs_remove_mounted(fs_info); - return error; + return ret; } static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 00da54f0164c..b19328d077d3 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -23,8 +23,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, { int ret; struct folio_batch fbatch; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; int i; int count = 0; int loops = 0; @@ -75,7 +75,8 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) dest[0] = 0; PRINT_ONE_FLAG(state, dest, cur, DIRTY); PRINT_ONE_FLAG(state, dest, cur, LOCKED); - PRINT_ONE_FLAG(state, dest, cur, NEW); + PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1); + PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2); PRINT_ONE_FLAG(state, dest, cur, DELALLOC); PRINT_ONE_FLAG(state, dest, cur, DEFRAG); PRINT_ONE_FLAG(state, dest, cur, BOUNDARY); @@ -113,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; - unsigned long index = 0; /* In this test we need at least 2 file extents at its maximum size */ u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 total_dirty = 2 * max_bytes; @@ -156,7 +156,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * everything to make sure our pages don't get evicted and screw up our * test. */ - for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { + for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) { page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_err("failed to allocate test page"); @@ -326,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) out_bits: if (ret) dump_extent_io_tree(tmp); - btrfs_clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); + btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL); out: if (locked_page) put_page(locked_page); @@ -343,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) unsigned long i; for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { - int bit, bit1; + bool bit_set, bit1_set; - bit = !!test_bit(i, bitmap); - bit1 = !!extent_buffer_test_bit(eb, 0, i); - if (bit1 != bit) { + bit_set = test_bit(i, bitmap); + bit1_set = extent_buffer_test_bit(eb, 0, i); + if (bit1_set != bit_set) { u8 has; u8 expect; @@ -360,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) return -EINVAL; } - bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, - i % BITS_PER_BYTE); - if (bit1 != bit) { + bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1_set != bit_set) { u8 has; u8 expect; @@ -662,7 +662,7 @@ static int test_find_first_clear_extent_bit(void) out: if (ret) dump_extent_io_tree(&tree); - btrfs_clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index b61972046feb..c8822edd32e2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, unsigned int i; int ret; - info = search_free_space_info(trans, cache, path, 0); + info = btrfs_search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); ret = PTR_ERR(info); @@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, goto invalid; offset = key.objectid; while (offset < key.objectid + key.offset) { - bit = free_space_test_bit(cache, path, offset); + bit = btrfs_free_space_test_bit(cache, path, offset); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, u32 flags; int ret; - info = search_free_space_info(trans, cache, path, 0); + info = btrfs_search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); btrfs_release_path(path); @@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, /* Flip it to the other format and check that for good measure. */ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { - ret = convert_free_space_to_extents(trans, cache, path); + ret = btrfs_convert_free_space_to_extents(trans, cache, path); if (ret) { test_err("could not convert to extents"); return ret; } } else { - ret = convert_free_space_to_bitmaps(trans, cache, path); + ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path); if (ret) { test_err("could not convert to bitmaps"); return ret; @@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans, const struct free_space_extent extents[] = {}; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, - cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; @@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, alignment); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, cache->start + cache->length - alignment, alignment); if (ret) { @@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start + alignment, + alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -338,29 +335,29 @@ static int test_merge_both(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans, }; int ret; - ret = __remove_from_free_space_tree(trans, cache, path, - cache->start, cache->length); + ret = __btrfs_remove_from_free_space_tree(trans, cache, path, + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->start, - alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start, + alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 4 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 4 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, - cache->start + 2 * alignment, alignment); + ret = __btrfs_add_to_free_space_tree(trans, cache, path, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, goto out; } - ret = add_block_group_free_space(&trans, cache); + ret = btrfs_add_block_group_free_space(&trans, cache); if (ret) { test_err("could not add block group free space"); goto out; } if (bitmaps) { - ret = convert_free_space_to_bitmaps(&trans, cache, path); + ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path); if (ret) { test_err("could not convert block group to bitmaps"); goto out; @@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, if (ret) goto out; - ret = remove_block_group_free_space(&trans, cache); + ret = btrfs_remove_block_group_free_space(&trans, cache); if (ret) { test_err("could not remove block group free space"); goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index a29d2c02c2c8..a4c2b7748b95 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -950,10 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ - ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1017,10 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ - ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1051,8 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* Empty */ - ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1066,8 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = 0; out: if (ret) - btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b96195d6480f..c5c0d9cf1a80 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1211,15 +1211,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; - int err; + int ret; - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; - if (errors && !err) - err = -EIO; - return err; + if (errors && !ret) + ret = -EIO; + return ret; } int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) @@ -1227,22 +1227,22 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) struct btrfs_fs_info *fs_info = log_root->fs_info; struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; bool errors = false; - int err; + int ret; ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); - if ((mark & EXTENT_DIRTY) && + ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if ((mark & EXTENT_DIRTY_LOG1) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; - if ((mark & EXTENT_NEW) && + if ((mark & EXTENT_DIRTY_LOG2) && test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) errors = true; - if (errors && !err) - err = -EIO; - return err; + if (errors && !ret) + ret = -EIO; + return ret; } /* @@ -1735,8 +1735,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, ret); - goto fail; + if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { + btrfs_abort_transaction(trans, ret); + goto fail; + } } /* @@ -2163,13 +2165,19 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } -static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +static void update_commit_stats(struct btrfs_fs_info *fs_info) { + ktime_t now = ktime_get_ns(); + ktime_t interval = now - fs_info->commit_stats.critical_section_start_time; + + ASSERT(fs_info->commit_stats.critical_section_start_time); + fs_info->commit_stats.commit_count++; fs_info->commit_stats.last_commit_dur = interval; fs_info->commit_stats.max_commit_dur = max_t(u64, fs_info->commit_stats.max_commit_dur, interval); fs_info->commit_stats.total_commit_dur += interval; + fs_info->commit_stats.critical_section_start_time = 0; } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -2178,8 +2186,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; - ktime_t start_time; - ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); @@ -2312,8 +2318,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit */ - start_time = ktime_get_ns(); - + fs_info->commit_stats.critical_section_start_time = ktime_get_ns(); extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2545,6 +2550,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + update_commit_stats(fs_info); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2581,8 +2587,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); - interval = ktime_get_ns() - start_time; - btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2590,8 +2594,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); - update_commit_stats(fs_info, interval); - return ret; unlock_reloc: diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 8f4703b488b7..0f556f4de3f9 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -191,7 +191,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, * Only subvolume trees along with their reloc trees need this check. * Things like log tree doesn't follow this ino requirement. */ - if (!is_fstree(btrfs_header_owner(leaf))) + if (!btrfs_is_fstree(btrfs_header_owner(leaf))) return true; if (key->objectid == prev_key->objectid) @@ -475,7 +475,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, * to be COWed to be relocated. */ if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && - !is_fstree(key->offset))) { + !btrfs_is_fstree(key->offset))) { generic_err(leaf, slot, "invalid reloc tree for root %lld, root id is not a subvolume tree", key->offset); @@ -493,7 +493,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, } /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ - if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { + if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) { dir_item_err(leaf, slot, "invalid location key objectid, have %llu expect [%llu, %llu]", key->objectid, BTRFS_FIRST_FREE_OBJECTID, @@ -1311,7 +1311,7 @@ static bool is_valid_dref_root(u64 rootid) * - tree root * For v1 space cache */ - return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || + return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || rootid == BTRFS_ROOT_TREE_OBJECTID; } @@ -2167,7 +2167,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) { - const bool is_subvol = is_fstree(root_owner); + const bool is_subvol = btrfs_is_fstree(root_owner); const u64 eb_owner = btrfs_header_owner(eb); /* @@ -2209,7 +2209,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) * For subvolume trees, owners can mismatch, but they should all belong * to subvolume trees. */ - if (unlikely(is_subvol != is_fstree(eb_owner))) { + if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) { btrfs_crit(eb->fs_info, "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", btrfs_header_level(eb) == 0 ? "leaf" : "node", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 97e933113b82..9f05d454b9df 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -112,7 +112,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all); + u64 dirid, bool del_all); static void wait_log_commit(struct btrfs_root *root, int transid); /* @@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r unsigned int nofs_flag; struct btrfs_inode *inode; + /* Only meant to be called for subvolume roots and not for log roots. */ + ASSERT(btrfs_is_fstree(btrfs_root_id(root))); + /* * We're holding a transaction handle whether we are logging or * replaying a log tree, so we must make sure NOFS semantics apply @@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, return 0; } -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct btrfs_inode *inode; - - inode = btrfs_iget_logging(objectid, root); - if (IS_ERR(inode)) - return NULL; - return inode; -} - /* replays a single extent in 'eb' at 'slot' with 'key' into the * subvolume 'root'. path is released on entry and should be released * on exit. @@ -668,15 +656,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + btrfs_err(fs_info, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), key->objectid, key->offset); + return -EUCLEAN; } - inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + inode = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); /* * first check to see if we already have this extent in the @@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -961,7 +950,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(&inode->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1051,6 +1041,126 @@ out: return ret; } +static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *log_root, + struct btrfs_key *search_key, + struct btrfs_inode *dir, + struct btrfs_inode *inode, + u64 parent_objectid) +{ + struct extent_buffer *leaf = path->nodes[0]; + unsigned long ptr; + unsigned long ptr_end; + + /* + * Check all the names in this back reference to see if they are in the + * log. If so, we allow them to stay otherwise they must be unlinked as + * a conflict. + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); + while (ptr < ptr_end) { + struct fscrypt_str victim_name; + struct btrfs_inode_ref *victim_ref; + int ret; + + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; + + ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + if (ret) { + kfree(victim_name.name); + if (ret < 0) + return ret; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; + continue; + } + + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name); + kfree(victim_name.name); + if (ret) + return ret; + return -EAGAIN; + } + + return 0; +} + +static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_root *root, + struct btrfs_root *log_root, + struct btrfs_key *search_key, + struct btrfs_inode *inode, + u64 inode_objectid, + u64 parent_objectid) +{ + struct extent_buffer *leaf = path->nodes[0]; + const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]); + const u32 item_size = btrfs_item_size(leaf, path->slots[0]); + u32 cur_offset = 0; + + while (cur_offset < item_size) { + struct btrfs_inode_extref *extref; + struct btrfs_inode *victim_parent; + struct fscrypt_str victim_name; + int ret; + + extref = (struct btrfs_inode_extref *)(base + cur_offset); + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); + + if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) + goto next; + + ret = read_alloc_one_name(leaf, &extref->name, victim_name.len, + &victim_name); + if (ret) + return ret; + + search_key->objectid = inode_objectid; + search_key->type = BTRFS_INODE_EXTREF_KEY; + search_key->offset = btrfs_extref_hash(parent_objectid, + victim_name.name, + victim_name.len); + ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name); + if (ret) { + kfree(victim_name.name); + if (ret < 0) + return ret; +next: + cur_offset += victim_name.len + sizeof(*extref); + continue; + } + + victim_parent = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(victim_parent)) { + kfree(victim_name.name); + return PTR_ERR(victim_parent); + } + + inc_nlink(&inode->vfs_inode); + btrfs_release_path(path); + + ret = unlink_inode_for_log_replay(trans, victim_parent, inode, + &victim_name); + iput(&victim_parent->vfs_inode); + kfree(victim_name.name); + if (ret) + return ret; + return -EAGAIN; + } + + return 0; +} + static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1061,7 +1171,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, struct fscrypt_str *name) { int ret; - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; struct btrfs_inode_extref *extref; @@ -1072,121 +1181,37 @@ again: search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = parent_objectid; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret == 0) { - struct btrfs_inode_ref *victim_ref; - unsigned long ptr; - unsigned long ptr_end; - - leaf = path->nodes[0]; - - /* are we trying to overwrite a back ref for the root directory - * if so, just jump out, we're done + if (ret < 0) { + return ret; + } else if (ret == 0) { + /* + * Are we trying to overwrite a back ref for the root directory? + * If so, we're done. */ if (search_key.objectid == search_key.offset) return 1; - /* check all the names in this back reference to see - * if they are in the log. if so, we allow them to stay - * otherwise they must be unlinked as a conflict - */ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); - while (ptr < ptr_end) { - struct fscrypt_str victim_name; - - victim_ref = (struct btrfs_inode_ref *)ptr; - ret = read_alloc_one_name(leaf, (victim_ref + 1), - btrfs_inode_ref_name_len(leaf, victim_ref), - &victim_name); - if (ret) - return ret; - - ret = backref_in_log(log_root, &search_key, - parent_objectid, &victim_name); - if (ret < 0) { - kfree(victim_name.name); - return ret; - } else if (!ret) { - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); - - ret = unlink_inode_for_log_replay(trans, dir, inode, - &victim_name); - kfree(victim_name.name); - if (ret) - return ret; - goto again; - } - kfree(victim_name.name); - - ptr = (unsigned long)(victim_ref + 1) + victim_name.len; - } + ret = unlink_refs_not_in_log(trans, path, log_root, &search_key, + dir, inode, parent_objectid); + if (ret == -EAGAIN) + goto again; + else if (ret) + return ret; } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, - inode_objectid, parent_objectid, 0, - 0); + extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid); if (IS_ERR(extref)) { return PTR_ERR(extref); } else if (extref) { - u32 item_size; - u32 cur_offset = 0; - unsigned long base; - struct btrfs_inode *victim_parent; - - leaf = path->nodes[0]; - - item_size = btrfs_item_size(leaf, path->slots[0]); - base = btrfs_item_ptr_offset(leaf, path->slots[0]); - - while (cur_offset < item_size) { - struct fscrypt_str victim_name; - - extref = (struct btrfs_inode_extref *)(base + cur_offset); - - if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) - goto next; - - ret = read_alloc_one_name(leaf, &extref->name, - btrfs_inode_extref_name_len(leaf, extref), - &victim_name); - if (ret) - return ret; - - search_key.objectid = inode_objectid; - search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name.name, - victim_name.len); - ret = backref_in_log(log_root, &search_key, - parent_objectid, &victim_name); - if (ret < 0) { - kfree(victim_name.name); - return ret; - } else if (!ret) { - ret = -ENOENT; - victim_parent = read_one_inode(root, - parent_objectid); - if (victim_parent) { - inc_nlink(&inode->vfs_inode); - btrfs_release_path(path); - - ret = unlink_inode_for_log_replay(trans, - victim_parent, - inode, &victim_name); - } - iput(&victim_parent->vfs_inode); - kfree(victim_name.name); - if (ret) - return ret; - goto again; - } - kfree(victim_name.name); -next: - cur_offset += victim_name.len + sizeof(*extref); - } + ret = unlink_extrefs_not_in_log(trans, path, root, log_root, + &search_key, inode, + inode_objectid, parent_objectid); + if (ret == -EAGAIN) + goto again; + else if (ret) + return ret; } btrfs_release_path(path); @@ -1314,9 +1339,9 @@ again: struct btrfs_inode *dir; btrfs_release_path(path); - dir = read_one_inode(root, parent_id); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_id, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); kfree(name.name); goto out; } @@ -1360,7 +1385,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, unsigned long ref_end; struct fscrypt_str name = { 0 }; int ret; - int log_ref_ver = 0; + const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY); u64 parent_objectid; u64 inode_objectid; u64 ref_index = 0; @@ -1369,11 +1394,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size(eb, slot); - if (key->type == BTRFS_INODE_EXTREF_KEY) { + if (is_extref_item) { struct btrfs_inode_extref *r; ref_struct_size = sizeof(struct btrfs_inode_extref); - log_ref_ver = 1; r = (struct btrfs_inode_extref *)ref_ptr; parent_objectid = btrfs_inode_extref_parent(eb, r); } else { @@ -1388,37 +1412,61 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * copy the back ref in. The link count fixup code will take * care of the rest */ - dir = read_one_inode(root, parent_objectid); - if (!dir) { - ret = -ENOENT; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + dir = NULL; goto out; } - inode = read_one_inode(root, inode_objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(inode_objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } while (ref_ptr < ref_end) { - if (log_ref_ver) { + if (is_extref_item) { ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); + if (ret) + goto out; /* * parent object can change from one array * item to another. */ - if (!dir) - dir = read_one_inode(root, parent_objectid); if (!dir) { - ret = -ENOENT; - goto out; + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; + /* + * A new parent dir may have not been + * logged and not exist in the subvolume + * tree, see the comment above before + * the loop when getting the first + * parent dir. + */ + if (ret == -ENOENT) { + /* + * The next extref may refer to + * another parent dir that + * exists, so continue. + */ + ret = 0; + goto next; + } + goto out; + } } } else { ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); + if (ret) + goto out; } - if (ret) - goto out; ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), ref_index, &name); @@ -1452,10 +1500,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. */ +next: ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; kfree(name.name); name.name = NULL; - if (log_ref_ver) { + if (is_extref_item && dir) { iput(&dir->vfs_inode); dir = NULL; } @@ -1632,8 +1681,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (inode->vfs_inode.i_nlink == 0) { if (S_ISDIR(inode->vfs_inode.i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - ino, 1); + ret = replay_dir_deletes(trans, root, NULL, path, ino, true); if (ret) goto out; } @@ -1681,9 +1729,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; btrfs_release_path(path); - inode = read_one_inode(root, key.offset); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.offset, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } @@ -1719,9 +1767,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *inode; struct inode *vfs_inode; - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; + inode = btrfs_iget_logging(objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; @@ -1760,14 +1808,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_inode *dir; int ret; - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; + inode = btrfs_iget_logging(location->objectid, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); - dir = read_one_inode(root, dirid); - if (!dir) { + dir = btrfs_iget_logging(dirid, root); + if (IS_ERR(dir)) { iput(&inode->vfs_inode); - return -EIO; + return PTR_ERR(dir); } ret = btrfs_add_link(trans, dir, inode, name, 1, index); @@ -1844,9 +1892,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool update_size = true; bool name_added = false; - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; + dir = btrfs_iget_logging(key->objectid, root); + if (IS_ERR(dir)) + return PTR_ERR(dir); ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); if (ret) @@ -2146,9 +2194,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(location.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; goto out; } @@ -2284,7 +2333,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, - u64 dirid, int del_all) + u64 dirid, bool del_all) { u64 range_start; u64 range_end; @@ -2300,14 +2349,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, if (!log_path) return -ENOMEM; - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log + dir = btrfs_iget_logging(dirid, root); + /* + * It isn't an error if the inode isn't there, that can happen because + * we replay the deletes before we copy in the inode item from the log. */ - if (!dir) { + if (IS_ERR(dir)) { btrfs_free_path(log_path); - return 0; + ret = PTR_ERR(dir); + if (ret == -ENOENT) + ret = 0; + return ret; } range_start = 0; @@ -2443,8 +2495,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid, 0); + ret = replay_dir_deletes(wc->trans, root, log, path, + key.objectid, false); if (ret) break; } @@ -2466,9 +2518,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_inode *inode; u64 from; - inode = read_one_inode(root, key.objectid); - if (!inode) { - ret = -EIO; + inode = btrfs_iget_logging(key.objectid, root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); break; } from = ALIGN(i_size_read(&inode->vfs_inode), @@ -2519,9 +2571,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - if (ret && ret != -ENOENT) + if (ret) break; - ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); @@ -2720,7 +2771,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, level = btrfs_header_level(log->node); orig_level = level; path->nodes[level] = log->node; - atomic_inc(&log->node->refs); + refcount_inc(&log->node->refs); path->slots[level] = 0; while (1) { @@ -2961,9 +3012,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (log_transid % 2 == 0) - mark = EXTENT_DIRTY; + mark = EXTENT_DIRTY_LOG1; else - mark = EXTENT_NEW; + mark = EXTENT_DIRTY_LOG2; /* we start IO on all the marked extents here, but we don't actually * wait for them until later. @@ -3094,7 +3145,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(fs_info, &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); blk_finish_plug(&plug); /* * As described above, -EAGAIN indicates a hole in the extents. We @@ -3114,7 +3165,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_tree_log_extents(log, mark); if (!ret) ret = btrfs_wait_tree_log_extents(log_root_tree, - EXTENT_NEW | EXTENT_DIRTY); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); if (ret) { btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); @@ -3240,9 +3291,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans, */ btrfs_write_marked_extents(log->fs_info, &log->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); btrfs_wait_tree_log_extents(log, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2); if (trans) btrfs_abort_transaction(trans, ret); @@ -3432,7 +3483,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * inode item because on log replay we update the field to reflect * all existing entries in the directory (see overwrite_item()). */ - return btrfs_delete_one_dir_name(trans, log, path, di); + return btrfs_del_item(trans, log, path); } /* @@ -3472,26 +3523,27 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, return; } - ret = join_running_log_trans(root); - if (ret) - return; - - mutex_lock(&dir->log_mutex); - path = btrfs_alloc_path(); if (!path) { - ret = -ENOMEM; - goto out_unlock; + btrfs_set_log_full_commit(trans); + return; } + ret = join_running_log_trans(root); + ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); + if (WARN_ON(ret)) + goto out; + + mutex_lock(&dir->log_mutex); + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), name, index); - btrfs_free_path(path); -out_unlock: mutex_unlock(&dir->log_mutex); if (ret < 0) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); +out: + btrfs_free_path(path); } /* see comments for btrfs_del_dir_entries_in_log */ @@ -3501,7 +3553,6 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; - u64 index; int ret; ret = inode_logged(trans, inode, NULL); @@ -3513,13 +3564,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, } ret = join_running_log_trans(root); - if (ret) + ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); + if (WARN_ON(ret)) return; log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), - dirid, &index); + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); @@ -3684,7 +3735,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) * Add extra ref to scratch eb so that it is not freed when callers * release the path, so we can reuse it later if needed. */ - atomic_inc(&ctx->scratch_eb->refs); + refcount_inc(&ctx->scratch_eb->refs); return 0; } @@ -4172,44 +4223,37 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, int log_inode_only, u64 logged_isize) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - if (log_inode_only) { /* set the generation to zero so the recover code * can tell the difference between an logging * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_token_inode_generation(&token, item, 0); - btrfs_set_token_inode_size(&token, item, logged_isize); + btrfs_set_inode_generation(leaf, item, 0); + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_size(&token, item, inode->i_size); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_size(leaf, item, inode->i_size); } - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4220,13 +4264,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * inode item in subvolume tree as needed (see overwrite_item()). */ - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -7192,8 +7236,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_path *path; struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { .process_func = process_one_buffer, @@ -7227,6 +7269,9 @@ again: key.offset = (u64)-1; while (1) { + struct btrfs_root *log; + struct btrfs_key found_key; + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { @@ -7255,6 +7300,12 @@ again: true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + wc.replay_dest = NULL; + if (ret != -ENOENT) { + btrfs_put_root(log); + btrfs_abort_transaction(trans, ret); + goto error; + } /* * We didn't find the subvol, likely because it was @@ -7267,36 +7318,36 @@ again: * block from being modified, and we'll just bail for * each subsequent pass. */ - if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, log->node); - btrfs_put_root(log); - - if (!ret) - goto next; - btrfs_abort_transaction(trans, ret); - goto error; + ret = btrfs_pin_extent_for_log_replay(trans, log->node); + if (ret) { + btrfs_put_root(log); + btrfs_abort_transaction(trans, ret); + goto error; + } + goto next; } wc.replay_dest->log_root = log; ret = btrfs_record_root_in_trans(trans, wc.replay_dest); - if (ret) - /* The loop needs to continue due to the root refs */ + if (ret) { btrfs_abort_transaction(trans, ret); - else - ret = walk_log_tree(trans, log, &wc); + goto next; + } - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { - ret = fixup_inode_link_counts(trans, wc.replay_dest, - path); - if (ret) - btrfs_abort_transaction(trans, ret); + ret = walk_log_tree(trans, log, &wc); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto next; } - if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + if (wc.stage == LOG_WALK_REPLAY_ALL) { struct btrfs_root *root = wc.replay_dest; - btrfs_release_path(path); - + ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto next; + } /* * We have just replayed everything, and the highest * objectid of fs roots probably has changed in case @@ -7306,17 +7357,20 @@ again: * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); - if (ret) + if (ret) { btrfs_abort_transaction(trans, ret); + goto next; + } + } +next: + if (wc.replay_dest) { + wc.replay_dest->log_root = NULL; + btrfs_put_root(wc.replay_dest); } - - wc.replay_dest->log_root = NULL; - btrfs_put_root(wc.replay_dest); btrfs_put_root(log); if (ret) goto error; -next: if (found_key.offset == 0) break; key.offset = found_key.offset - 1; @@ -7447,6 +7501,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * full log sync. * Also we don't need to worry with renames, since btrfs_rename() marks the log * for full commit when renaming a subvolume. + * + * Must be called before creating the subvolume entry in its parent directory. */ void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, struct btrfs_inode *dir) @@ -7483,6 +7539,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + btrfs_init_log_ctx(&ctx, inode); + ctx.logging_new_name = true; + /* * this will force the logging code to walk the dentry chain * up for the file @@ -7514,6 +7573,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, ret = 0; /* + * Now that we know we need to update the log, allocate the scratch eb + * for the context before joining a log transaction below, as this can + * take time and therefore we could delay log commits from other tasks. + */ + btrfs_init_log_ctx_scratch_eb(&ctx); + + /* * If we are doing a rename (old_dir is not NULL) from a directory that * was previously logged, make sure that on log replay we get the old * dir entry deleted. This is needed because we will also log the new @@ -7531,6 +7597,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, &old_dentry->d_name, 0, &fname); if (ret) goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + fscrypt_free_filename(&fname); + goto out; + } + /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7544,19 +7618,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * mark the log for a full commit. */ if (WARN_ON_ONCE(ret < 0)) { + btrfs_free_path(path); fscrypt_free_filename(&fname); goto out; } log_pinned = true; - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - fscrypt_free_filename(&fname); - goto out; - } - /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or @@ -7588,9 +7656,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, goto out; } - btrfs_init_log_ctx(&ctx, inode); - ctx.logging_new_name = true; - btrfs_init_log_ctx_scratch_eb(&ctx); /* * We don't care about the return value. If we fail to log the new name * then we know the next attempt to sync the log will fallback to a full @@ -7599,7 +7664,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); - free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.conflict_inodes)); out: /* @@ -7612,5 +7676,6 @@ out: btrfs_set_log_full_commit(trans); if (log_pinned) btrfs_end_log_trans(root); + free_extent_buffer(ctx.scratch_eb); } diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 1ac2678fc4ca..9e8cb3b7c064 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -27,18 +27,29 @@ struct tree_mod_elem { /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ u64 generation; - /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ - struct btrfs_disk_key key; - u64 blockptr; - - /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ - struct { - int dst_slot; - int nr_items; - } move; - - /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ - struct tree_mod_root old_root; + union { + /* + * This is used for the following op types: + * + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING + * BTRFS_MOD_LOG_KEY_REMOVE + * BTRFS_MOD_LOG_KEY_REPLACE + */ + struct { + struct btrfs_disk_key key; + u64 blockptr; + } slot_change; + + /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; + }; }; /* @@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, return 0; } +static inline bool skip_eb_logging(const struct extent_buffer *eb) +{ + const u64 owner = btrfs_header_owner(eb); + + if (btrfs_header_level(eb) == 0) + return true; + + /* + * Tree mod logging exists so that there's a consistent view of the + * extents and backrefs of inodes even if while a task is iterating over + * them other tasks are modifying subvolume trees and the extent tree + * (including running delayed refs). So we only need to log extent + * buffers from the extent tree and subvolume trees. + */ + + if (owner == BTRFS_EXTENT_TREE_OBJECTID) + return false; + + if (btrfs_is_fstree(owner)) + return false; + + return true; +} + /* * Determines if logging can be omitted. Returns true if it can. Otherwise, it * returns false with the tree_mod_log_lock acquired. The caller must hold @@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; - if (eb && btrfs_header_level(eb) == 0) + if (eb && skip_eb_logging(eb)) return true; write_lock(&fs_info->tree_mod_log_lock); @@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; - if (eb && btrfs_header_level(eb) == 0) + if (eb && skip_eb_logging(eb)) return false; return true; @@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, { struct tree_mod_elem *tm; + /* Can't be one of these types, due to union in struct tree_mod_elem. */ + ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS); + ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; tm->logical = eb->start; - if (op != BTRFS_MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } + btrfs_node_key(eb, &tm->slot_change.key, slot); + tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot); tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); @@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, fallthrough; case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: case BTRFS_MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); n++; @@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, break; case BTRFS_MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr); btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); break; diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index fc59b57257d6..7e16a253fb35 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist) kfree(ulist); } +static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node) +{ + const u64 *val = key; + const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node); + + if (unode->val < *val) + return 1; + else if (unode->val > *val) + return -1; + + return 0; +} + static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { - struct rb_node *n = ulist->root.rb_node; - struct ulist_node *u = NULL; - - while (n) { - u = rb_entry(n, struct ulist_node, rb_node); - if (u->val < val) - n = n->rb_right; - else if (u->val > val) - n = n->rb_left; - else - return u; - } - return NULL; + struct rb_node *node; + + node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp); + return rb_entry_safe(node, struct ulist_node, rb_node); } static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) @@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) ulist->nnodes--; } +static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing) +{ + const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node); + + return ulist_node_val_key_cmp(&unode->val, existing); +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { - struct rb_node **p = &ulist->root.rb_node; - struct rb_node *parent = NULL; - struct ulist_node *cur = NULL; - - while (*p) { - parent = *p; - cur = rb_entry(parent, struct ulist_node, rb_node); - - if (cur->val < ins->val) - p = &(*p)->rb_right; - else if (cur->val > ins->val) - p = &(*p)->rb_left; - else - return -EEXIST; - } - rb_link_node(&ins->rb_node, parent, p); - rb_insert_color(&ins->rb_node, &ulist->root); + struct rb_node *node; + + node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp); + if (node) + return -EEXIST; return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89835071cfea..fa7a929a0461 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -18,7 +18,6 @@ #include "transaction.h" #include "volumes.h" #include "raid56.h" -#include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" @@ -214,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) u64 flags = bg_flags; u32 size_bp = size_buf; - if (!flags) { - strcpy(bp, "NONE"); + if (!flags) return; - } #define DESCRIBE_FLAG(flag, desc) \ do { \ @@ -403,7 +400,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); - rcu_string_free(device->name); + /* + * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is + * reading the device name. + */ + kfree(rcu_dereference_raw(device->name)); btrfs_extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); @@ -414,6 +415,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device *device; WARN_ON(fs_devices->opened); + WARN_ON(fs_devices->holding); while (!list_empty(&fs_devices->devices)) { device = list_first_entry(&fs_devices->devices, struct btrfs_device, dev_list); @@ -473,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, struct block_device *bdev; int ret; - *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL); + *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops); if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); @@ -488,7 +490,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { - fput(*bdev_file); + bdev_fput(*bdev_file); goto error; } } @@ -496,7 +498,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, *disk_super = btrfs_read_disk_super(bdev, 0, false); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - fput(*bdev_file); + bdev_fput(*bdev_file); goto error; } @@ -541,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device continue; if (devt && devt != device->devt) continue; - if (fs_devices->opened) { + if (fs_devices->opened || fs_devices->holding) { if (devt) ret = -EBUSY; break; @@ -657,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!device->name) return -EINVAL; - ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1, &bdev_file, &disk_super); if (ret) return ret; @@ -674,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { - pr_err( - "BTRFS: Invalid seeding and uuid-changed device detected\n"); + btrfs_err(NULL, + "invalid seeding and uuid-changed device detected"); goto error_free_page; } @@ -701,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (device->devt != device->bdev->bd_dev) { btrfs_warn(NULL, "device %s maj:min changed from %d:%d to %d:%d", - device->name->str, MAJOR(device->devt), + rcu_dereference_raw(device->name), MAJOR(device->devt), MINOR(device->devt), MAJOR(device->bdev->bd_dev), MINOR(device->bdev->bd_dev)); @@ -720,7 +722,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - fput(bdev_file); + bdev_fput(bdev_file); return -EINVAL; } @@ -749,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) goto out; rcu_read_lock(); - ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX); + ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX); rcu_read_unlock(); if (ret < 0) goto out; @@ -782,11 +784,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = NULL; - struct rcu_string *name; + const char *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; - int error; + int ret; bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); @@ -798,11 +800,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EAGAIN); } - error = lookup_bdev(path, &path_devt); - if (error) { + ret = lookup_bdev(path, &path_devt); + if (ret) { btrfs_err(NULL, "failed to lookup block device for path %s: %d", - path, error); - return ERR_PTR(error); + path, ret); + return ERR_PTR(ret); } fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); @@ -819,7 +821,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; - pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n", + btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU", path, MAJOR(path_devt), MINOR(path_devt), fs_devices->fsid); } @@ -890,6 +892,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, current->comm, task_pid_nr(current)); } else if (!device->name || !is_same_device(device, path)) { + const char *old_name; + /* * When FS is already mounted. * 1. If you are here and if the device->name is NULL that @@ -943,27 +947,31 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (device->bdev) { if (device->devt != path_devt) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_warn_in_rcu(NULL, + btrfs_warn(NULL, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", path, devid, found_transid, current->comm, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - btrfs_info_in_rcu(NULL, + btrfs_info(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } - name = rcu_string_strdup(path, GFP_NOFS); + name = kstrdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-ENOMEM); } - rcu_string_free(device->name); + rcu_read_lock(); + old_name = rcu_dereference(device->name); + rcu_read_unlock(); rcu_assign_pointer(device->name, name); + kfree_rcu_mightsleep(old_name); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { fs_devices->missing_devices--; clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); @@ -1012,7 +1020,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * uuid mutex so nothing we touch in here is going to disappear. */ if (orig_dev->name) - dev_path = orig_dev->name->str; + dev_path = rcu_dereference_raw(orig_dev->name); device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid, dev_path); @@ -1070,7 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; if (device->bdev_file) { - fput(device->bdev_file); + bdev_fput(device->bdev_file); device->bdev = NULL; device->bdev_file = NULL; fs_devices->open_devices--; @@ -1117,7 +1125,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - fput(device->bdev_file); + bdev_fput(device->bdev_file); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1197,7 +1205,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) { + if (!fs_devices->opened && !fs_devices->holding) { list_splice_init(&fs_devices->seed_list, &list); /* @@ -1414,7 +1422,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev && (device->bdev->bd_dev == devt) && - strcmp(device->name->str, path) != 0) { + strcmp(rcu_dereference_raw(device->name), path) != 0) { mutex_unlock(&fs_devices->device_list_mutex); /* Do not skip registration. */ @@ -1440,7 +1448,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super, * the device or return an error. Multi-device and seeding devices are registered * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, +struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev) { struct btrfs_super_block *disk_super; @@ -1461,7 +1469,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL); + bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); @@ -1473,7 +1481,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, devt = file_bdev(bdev_file)->bd_dev; if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) { - pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n", + btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)", path, MAJOR(devt), MINOR(devt)); btrfs_free_stale_devices(devt, NULL); @@ -1490,7 +1498,7 @@ free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - fput(bdev_file); + bdev_fput(bdev_file); return device; } @@ -2164,7 +2172,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device->name->str); + update_dev_time(rcu_dereference_raw(device->name)); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, @@ -2204,7 +2212,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } if (btrfs_pinned_by_swapfile(fs_info, device)) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", btrfs_dev_name(device), device->devid); return -ETXTBSY; @@ -2294,7 +2302,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and fput() on the block device will pull in the + * write lock, and bdev_fput() on the block device will pull in the * ->open_mutex on the block device and it's dependencies. Instead * just flush the device and let the caller do the final bdev_release. */ @@ -2473,7 +2481,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - fput(bdev_file); + bdev_fput(bdev_file); return 0; } @@ -2705,7 +2713,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return -EROFS; bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); + fs_info->sb, &fs_holder_ops); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); @@ -2921,7 +2929,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - fput(bdev_file); + bdev_fput(bdev_file); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -3282,6 +3290,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } @@ -3398,7 +3412,8 @@ out: return ret; } -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, + bool verbose) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; @@ -3428,7 +3443,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); - ret = btrfs_relocate_block_group(fs_info, chunk_offset); + ret = btrfs_relocate_block_group(fs_info, chunk_offset, true); btrfs_scrub_continue(fs_info); if (ret) { /* @@ -3538,7 +3553,8 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(fs_info, found_key.offset); + ret = btrfs_relocate_chunk(fs_info, found_key.offset, + true); if (ret == -ENOSPC) failed++; else @@ -4203,7 +4219,7 @@ again: } } - ret = btrfs_relocate_chunk(fs_info, found_key.offset); + ret = btrfs_relocate_chunk(fs_info, found_key.offset, true); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; @@ -4971,7 +4987,7 @@ again: goto done; } - ret = btrfs_relocate_chunk(fs_info, chunk_offset); + ret = btrfs_relocate_chunk(fs_info, chunk_offset, true); mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; @@ -5003,8 +5019,8 @@ again: mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ - btrfs_clear_extent_bits(&device->alloc_state, new_size, (u64)-1, - CHUNK_STATE_MASK); + btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK, NULL); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) @@ -5431,9 +5447,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - btrfs_clear_extent_bits(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT); + btrfs_clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -6917,9 +6933,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, generate_random_uuid(dev->uuid); if (path) { - struct rcu_string *name; + const char *name; - name = rcu_string_strdup(path, GFP_KERNEL); + name = kstrdup(path, GFP_KERNEL); if (!name) { btrfs_free_device(dev); return ERR_PTR(-ENOMEM); @@ -7168,7 +7184,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); @@ -7700,7 +7716,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); goto out; @@ -7711,7 +7727,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; @@ -7725,7 +7741,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - btrfs_warn_in_rcu(fs_info, + btrfs_warn(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); goto out; @@ -7788,7 +7804,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) if (!dev->dev_stats_valid) return; - btrfs_err_rl_in_rcu(dev->fs_info, + btrfs_err_rl(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), @@ -7808,7 +7824,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - btrfs_info_in_rcu(dev->fs_info, + btrfs_info(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), @@ -7932,7 +7948,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* - * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * Very old mkfs.btrfs (before v4.15) will not respect the reserved * space. Although kernel can handle it without problem, better to warn * the users. */ @@ -8184,7 +8200,7 @@ static int relocating_repair_kthread(void *data) btrfs_info(fs_info, "zoned: relocating block group %llu to repair IO failure", target); - ret = btrfs_relocate_chunk(fs_info, target); + ret = btrfs_relocate_chunk(fs_info, target, true); out: if (cache) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 137cc232f58e..a56e873a3029 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,7 +21,6 @@ #include <uapi/linux/btrfs.h> #include <uapi/linux/btrfs_tree.h> #include "messages.h" -#include "rcu-string.h" #include "extent-io-tree.h" struct block_device; @@ -114,7 +113,8 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string __rcu *name; + /* Device path or NULL if missing. */ + const char __rcu *name; u64 generation; @@ -422,6 +422,16 @@ struct btrfs_fs_devices { /* Count fs-devices opened. */ int opened; + /* + * Counter of the processes that are holding this fs_devices but not + * yet opened. + * This is for mounting handling, as we can only open the fs_devices + * after a super block is created. But we cannot take uuid_mutex + * during sget_fc(), thus we have to hold the fs_devices (meaning it + * cannot be released) until a super block is returned. + */ + int holding; + /* Set when we find or add a device that doesn't have the nonrot flag set. */ bool rotating; /* Devices support TRIM/discard commands. */ @@ -667,7 +677,7 @@ enum btrfs_map_op { BTRFS_MAP_GET_READ_MIRRORS, }; -static inline enum btrfs_map_op btrfs_op(struct bio *bio) +static inline enum btrfs_map_op btrfs_op(const struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -719,8 +729,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, - bool mount_arg_dev); +struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -754,7 +763,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); -int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, + bool verbose); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); @@ -846,7 +856,7 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device) if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) return "<missing disk>"; else - return rcu_str_deref(device->name); + return rcu_dereference(device->name); } static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol) @@ -854,6 +864,20 @@ static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocati WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol); } +static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices) +{ + lockdep_assert_held(&uuid_mutex); + ASSERT(fs_devices->holding >= 0); + fs_devices->holding++; +} + +static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices) +{ + lockdep_assert_held(&uuid_mutex); + ASSERT(fs_devices->holding > 0); + fs_devices->holding--; +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3e0edbcf73e1..79fb1614bd0c 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -510,14 +510,15 @@ static int btrfs_initxattrs(struct inode *inode, */ nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_KERNEL); + const size_t name_len = XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1; + + name = kmalloc(name_len, GFP_KERNEL); if (!name) { ret = -ENOMEM; break; } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + scnprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, xattr->name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5b0156d5b95..245e813ecd78 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -9,7 +9,6 @@ #include "ctree.h" #include "volumes.h" #include "zoned.h" -#include "rcu-string.h" #include "disk-io.h" #include "block-group.h" #include "dev-replace.h" @@ -17,6 +16,7 @@ #include "fs.h" #include "accessors.h" #include "bio.h" +#include "transaction.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -263,9 +263,9 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to read zone %llu on %s (devid %llu)", - pos, rcu_str_deref(device->name), + pos, rcu_dereference(device->name), device->devid); return ret; } @@ -395,16 +395,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) /* We reject devices with a zone size larger than 8GB */ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: zone size %llu larger than supported maximum %llu", - rcu_str_deref(device->name), + rcu_dereference(device->name), zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: zone size %llu smaller than supported minimum %u", - rcu_str_deref(device->name), + rcu_dereference(device->name), zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); ret = -EINVAL; goto out; @@ -418,9 +418,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", - rcu_str_deref(device->name), max_active_zones, + rcu_dereference(device->name), max_active_zones, BTRFS_MIN_ACTIVE_ZONES); ret = -EINVAL; goto out; @@ -460,9 +460,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_cache = vcalloc(zone_info->nr_zones, sizeof(struct blk_zone)); if (!zone_info->zone_cache) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to allocate zone cache for %s", - rcu_str_deref(device->name)); + rcu_dereference(device->name)); ret = -ENOMEM; goto out; } @@ -497,9 +497,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } if (nreported != zone_info->nr_zones) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "inconsistent number of zones on %s (%u/%u)", - rcu_str_deref(device->name), nreported, + rcu_dereference(device->name), nreported, zone_info->nr_zones); ret = -EIO; goto out; @@ -507,9 +507,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (max_active_zones) { if (nactive > max_active_zones) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: %u active zones on %s exceeds max_active_zones %u", - nactive, rcu_str_deref(device->name), + nactive, rcu_dereference(device->name), max_active_zones); ret = -EIO; goto out; @@ -538,7 +538,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: failed to read super block log zone info at devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; @@ -556,7 +556,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) ret = sb_write_pointer(device->bdev, &zone_info->sb_zones[sb_pos], &sb_wp); if (ret != -ENOENT && ret) { - btrfs_err_in_rcu(device->fs_info, + btrfs_err(device->fs_info, "zoned: super block log zone corrupted devid %llu zone %u", device->devid, sb_zone); ret = -EUCLEAN; @@ -575,9 +575,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) emulated = "emulated "; } - btrfs_info_in_rcu(fs_info, + btrfs_info(fs_info, "%s block device %s, %u %szones of %llu bytes", - model, rcu_str_deref(device->name), zone_info->nr_zones, + model, rcu_dereference(device->name), zone_info->nr_zones, emulated, zone_info->zone_size); return 0; @@ -1182,10 +1182,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) continue; /* Free regions should be empty */ - btrfs_warn_in_rcu( + btrfs_warn( device->fs_info, "zoned: resetting device %s (devid %llu) zone %llu for allocation", - rcu_str_deref(device->name), device->devid, pos >> shift); + rcu_dereference(device->name), device->devid, pos >> shift); WARN_ON_ONCE(1); ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, @@ -1345,9 +1345,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: unexpected conventional zone %llu on device %s (devid %llu)", - zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), + zone.start << SECTOR_SHIFT, rcu_dereference(device->name), device->devid); up_read(&dev_replace->rwsem); return -EIO; @@ -1358,10 +1358,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: - btrfs_err_in_rcu(fs_info, + btrfs_err(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), - rcu_str_deref(device->name), device->devid); + rcu_dereference(device->name), device->devid); info->alloc_offset = WP_MISSING_DEV; break; case BLK_ZONE_COND_EMPTY: @@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].physical); return -EIO; } + + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) + zone_info[0].alloc_offset = last_alloc; + + if (zone_info[1].alloc_offset == WP_CONVENTIONAL) + zone_info[1].alloc_offset = last_alloc; + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); @@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; @@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = last_alloc; + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, @@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, map->num_stripes); + div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > i) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == i) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; @@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { @@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, + map->num_stripes / map->sub_stripes); + div_u64_rem(stripe_nr, + (map->num_stripes / map->sub_stripes), + &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > (i / map->sub_stripes)) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == (i / map->sub_stripes)) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; @@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active, + last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid1(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid0(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid10(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: @@ -2427,7 +2485,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, /* For the work */ btrfs_get_block_group(bg); - atomic_inc(&eb->refs); + refcount_inc(&eb->refs); bg->last_eb = eb; INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); queue_work(system_unbound_wq, &bg->zone_finish_work); @@ -2443,6 +2501,66 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) spin_unlock(&fs_info->relocation_bg_lock); } +void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_space_info *space_info = data_sinfo->sub_group[0]; + struct btrfs_trans_handle *trans; + struct btrfs_block_group *bg; + struct list_head *bg_list; + u64 alloc_flags; + bool initial = false; + bool did_chunk_alloc = false; + int index; + int ret; + + if (!btrfs_is_zoned(fs_info)) + return; + + if (fs_info->data_reloc_bg) + return; + + if (sb_rdonly(fs_info->sb)) + return; + + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); + index = btrfs_bg_flags_to_raid_index(alloc_flags); + + bg_list = &data_sinfo->block_groups[index]; +again: + list_for_each_entry(bg, bg_list, list) { + if (bg->used > 0) + continue; + + if (!initial) { + initial = true; + continue; + } + + fs_info->data_reloc_bg = bg->start; + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags); + btrfs_zone_activate(bg); + + return; + } + + if (did_chunk_alloc) + return; + + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return; + + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret == 1) { + did_chunk_alloc = true; + bg_list = &space_info->block_groups[index]; + goto again; + } +} + void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2465,8 +2583,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + u64 total = btrfs_super_total_bytes(fs_info->super_copy); u64 used = 0; - u64 total = 0; u64 factor; ASSERT(btrfs_is_zoned(fs_info)); @@ -2479,7 +2597,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) if (!device->bdev) continue; - total += device->disk_total_bytes; used += device->bytes_used; } mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 9672bf4c3335..6e11533b8e14 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -88,6 +88,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, @@ -241,6 +242,8 @@ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } +static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { } + static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 4a796a049b5a..ff0292615e1f 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -200,8 +200,7 @@ void zstd_init_workspace_manager(void) ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { - pr_warn( - "BTRFS: cannot preallocate zstd compression workspace\n"); + btrfs_warn(NULL, "cannot preallocate zstd compression workspace"); } else { set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); diff --git a/fs/buffer.c b/fs/buffer.c index 8cf4a1dc481e..ead4dc85debd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1122,15 +1122,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, { bool blocking = gfpflags_allow_blocking(gfp); - /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_logical_block_size(bdev)-1) || - (size < 512 || size > PAGE_SIZE))) { - printk(KERN_ERR "getblk(): invalid block size %d requested\n", - size); - printk(KERN_ERR "logical block size: %d\n", - bdev_logical_block_size(bdev)); - - dump_stack(); + if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { + printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n", + size, bdev_logical_block_size(bdev)); return NULL; } @@ -2271,9 +2265,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, } EXPORT_SYMBOL(block_write_begin); -int block_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +int block_write_end(loff_t pos, unsigned len, unsigned copied, + struct folio *folio) { size_t start = pos - folio_pos(folio); @@ -2304,15 +2297,15 @@ int block_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(block_write_end); -int generic_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * No need to use i_size_read() here, the i_size cannot change under us @@ -2501,7 +2494,8 @@ out: } EXPORT_SYMBOL(generic_cont_expand_simple); -static int cont_expand_zero(struct file *file, struct address_space *mapping, +static int cont_expand_zero(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; @@ -2525,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = aops->write_begin(file, mapping, curpos, len, + err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err = aops->write_end(file, mapping, curpos, len, len, + err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2558,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = aops->write_begin(file, mapping, curpos, len, + err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err = aops->write_end(file, mapping, curpos, len, len, + err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2578,17 +2572,16 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata, - get_block_t *get_block, loff_t *bytes) +int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; unsigned int blocksize = i_blocksize(inode); unsigned int zerofrom; int err; - err = cont_expand_zero(file, mapping, pos, bytes); + err = cont_expand_zero(iocb, mapping, pos, bytes); if (err) return err; @@ -2610,7 +2603,7 @@ EXPORT_SYMBOL(cont_write_begin); * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * - * We are not allowed to take the i_mutex here so we have to play games to + * We are not allowed to take the i_rwsem here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 92058ae43488..3e0576d9db1d 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -63,7 +63,7 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret) ret = -ESTALE; } - ki->term_func(ki->term_func_priv, ret, ki->was_async); + ki->term_func(ki->term_func_priv, ret); } cachefiles_put_kiocb(ki); @@ -188,7 +188,7 @@ in_progress: presubmission_error: if (term_func) - term_func(term_func_priv, ret < 0 ? ret : skipped, false); + term_func(term_func_priv, ret < 0 ? ret : skipped); return ret; } @@ -271,7 +271,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing); set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags); if (ki->term_func) - ki->term_func(ki->term_func_priv, ret, ki->was_async); + ki->term_func(ki->term_func_priv, ret); cachefiles_put_kiocb(ki); } @@ -301,7 +301,7 @@ int __cachefiles_write(struct cachefiles_object *object, ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) { if (term_func) - term_func(term_func_priv, -ENOMEM, false); + term_func(term_func_priv, -ENOMEM); return -ENOMEM; } @@ -347,8 +347,6 @@ int __cachefiles_write(struct cachefiles_object *object, default: ki->was_async = false; cachefiles_write_complete(&ki->iocb, ret); - if (ret > 0) - ret = 0; break; } @@ -366,7 +364,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, { if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { if (term_func) - term_func(term_func_priv, -ENOBUFS, false); + term_func(term_func_priv, -ENOBUFS); trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite); return -ENOBUFS; } @@ -665,7 +663,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) pre = CACHEFILES_DIO_BLOCK_SIZE - off; if (pre >= len) { fscache_count_dio_misfit(); - netfs_write_subrequest_terminated(subreq, len, false); + netfs_write_subrequest_terminated(subreq, len); return; } subreq->transferred += pre; @@ -691,7 +689,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) len -= post; if (len == 0) { fscache_count_dio_misfit(); - netfs_write_subrequest_terminated(subreq, post, false); + netfs_write_subrequest_terminated(subreq, post); return; } iov_iter_truncate(&subreq->io_iter, len); @@ -703,7 +701,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) &start, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) { - netfs_write_subrequest_terminated(subreq, ret, false); + netfs_write_subrequest_terminated(subreq, ret); return; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index aecfc5c37b49..91dfd0231877 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -388,10 +388,10 @@ try_again: } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = d_inode(dir), + .old_parent = dir, .old_dentry = rep, .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = d_inode(cache->graveyard), + .new_parent = cache->graveyard, .new_dentry = grave, }; trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index d9bc67176128..a7ed86fa98bb 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -83,10 +83,8 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); - if (!ret) { - ret = len; + if (ret > 0) kiocb->ki_pos += ret; - } out: fput(file); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 29be367905a1..8b202d789e93 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -238,6 +238,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); if (err < subreq->len && + subreq->rreq->origin != NETFS_UNBUFFERED_READ && subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { @@ -281,7 +282,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) size_t len; int mode; - if (rreq->origin != NETFS_DIO_READ) + if (rreq->origin != NETFS_UNBUFFERED_READ && + rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -407,6 +409,15 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct page **pages; size_t page_off; + /* + * FIXME: io_iter.count needs to be corrected to aligned + * length. Otherwise, iov_iter_get_pages_alloc2() operates + * with the initial unaligned length value. As a result, + * ceph_msg_data_cursor_init() triggers BUG_ON() in the case + * if msg->sparse_read_total > msg->data_length. + */ + subreq->io_iter.count = len; + err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off); if (err < 0) { doutc(cl, "%llx.%llx failed to allocate pages, %d\n", @@ -539,7 +550,7 @@ static void ceph_set_page_fscache(struct page *page) folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ } -static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) +static void ceph_fscache_write_terminated(void *priv, ssize_t error) { struct inode *inode = priv; @@ -1853,10 +1864,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. */ -static int ceph_write_begin(struct file *file, struct address_space *mapping, +static int ceph_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int r; @@ -1874,10 +1887,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, * we don't do anything in here that simple_write_end doesn't do * except adjust dirty page accounting */ -static int ceph_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, +static int ceph_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; @@ -2319,13 +2334,13 @@ static const struct vm_operations_struct ceph_vmops = { .page_mkwrite = ceph_page_mkwrite, }; -int ceph_mmap(struct file *file, struct vm_area_struct *vma) +int ceph_mmap_prepare(struct vm_area_desc *desc) { - struct address_space *mapping = file->f_mapping; + struct address_space *mapping = desc->file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; - vma->vm_ops = &ceph_vmops; + desc->vm_ops = &ceph_vmops; return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a8d8b56cf9d2..b1a8ff612c41 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4957,24 +4957,20 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { + int len = dentry->d_name.len; doutc(cl, "%p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); + memcpy(*p, dentry->d_name.name, len); spin_unlock(&dentry->d_lock); if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { - int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); - - if (ret2 < 0) - return ret2; - - rel->dname_len = cpu_to_le32(ret2); - *p += ret2; - } else { - rel->dname_len = cpu_to_le32(dentry->d_name.len); - memcpy(*p, dentry->d_name.name, dentry->d_name.len); - *p += dentry->d_name.len; + len = ceph_encode_encrypted_dname(dir, *p, len); + if (len < 0) + return len; } + rel->dname_len = cpu_to_le32(len); + *p += len; } else { spin_unlock(&dentry->d_lock); } diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index 3b3c4d8d401e..cab722619207 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -215,35 +215,31 @@ static struct inode *parse_longname(const struct inode *parent, struct ceph_client *cl = ceph_inode_to_client(parent); struct inode *dir = NULL; struct ceph_vino vino = { .snap = CEPH_NOSNAP }; - char *inode_number; - char *name_end; - int orig_len = *name_len; + char *name_end, *inode_number; int ret = -EIO; - + /* NUL-terminate */ + char *str __free(kfree) = kmemdup_nul(name, *name_len, GFP_KERNEL); + if (!str) + return ERR_PTR(-ENOMEM); /* Skip initial '_' */ - name++; - name_end = strrchr(name, '_'); + str++; + name_end = strrchr(str, '_'); if (!name_end) { - doutc(cl, "failed to parse long snapshot name: %s\n", name); + doutc(cl, "failed to parse long snapshot name: %s\n", str); return ERR_PTR(-EIO); } - *name_len = (name_end - name); + *name_len = (name_end - str); if (*name_len <= 0) { pr_err_client(cl, "failed to parse long snapshot name\n"); return ERR_PTR(-EIO); } /* Get the inode number */ - inode_number = kmemdup_nul(name_end + 1, - orig_len - *name_len - 2, - GFP_KERNEL); - if (!inode_number) - return ERR_PTR(-ENOMEM); + inode_number = name_end + 1; ret = kstrtou64(inode_number, 10, &vino.ino); if (ret) { - doutc(cl, "failed to parse inode number: %s\n", name); - dir = ERR_PTR(ret); - goto out; + doutc(cl, "failed to parse inode number: %s\n", str); + return ERR_PTR(ret); } /* And finally the inode */ @@ -254,42 +250,29 @@ static struct inode *parse_longname(const struct inode *parent, if (IS_ERR(dir)) doutc(cl, "can't find inode %s (%s)\n", inode_number, name); } - -out: - kfree(inode_number); return dir; } -int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, - char *buf) +int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) { struct ceph_client *cl = ceph_inode_to_client(parent); struct inode *dir = parent; - struct qstr iname; + char *p = buf; u32 len; - int name_len; - int elen; + int name_len = elen; int ret; u8 *cryptbuf = NULL; - iname.name = d_name->name; - name_len = d_name->len; - /* Handle the special case of snapshot names that start with '_' */ - if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && - (iname.name[0] == '_')) { - dir = parse_longname(parent, iname.name, &name_len); + if (ceph_snap(dir) == CEPH_SNAPDIR && *p == '_') { + dir = parse_longname(parent, p, &name_len); if (IS_ERR(dir)) return PTR_ERR(dir); - iname.name++; /* skip initial '_' */ + p++; /* skip initial '_' */ } - iname.len = name_len; - if (!fscrypt_has_encryption_key(dir)) { - memcpy(buf, d_name->name, d_name->len); - elen = d_name->len; + if (!fscrypt_has_encryption_key(dir)) goto out; - } /* * Convert cleartext d_name to ciphertext. If result is longer than @@ -297,7 +280,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, * * See: fscrypt_setup_filename */ - if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) { + if (!fscrypt_fname_encrypted_size(dir, name_len, NAME_MAX, &len)) { elen = -ENAMETOOLONG; goto out; } @@ -310,7 +293,9 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, goto out; } - ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len); + ret = fscrypt_fname_encrypt(dir, + &(struct qstr)QSTR_INIT(p, name_len), + cryptbuf, len); if (ret) { elen = ret; goto out; @@ -331,18 +316,13 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, } /* base64 encode the encrypted name */ - elen = ceph_base64_encode(cryptbuf, len, buf); - doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf); + elen = ceph_base64_encode(cryptbuf, len, p); + doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p); /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ WARN_ON(elen > 240); - if ((elen > 0) && (dir != parent)) { - char tmp_buf[NAME_MAX]; - - elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", - elen, buf, dir->i_ino); - memcpy(buf, tmp_buf, elen); - } + if (dir != parent) // leading _ is already there; append _<inum> + elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino); out: kfree(cryptbuf); @@ -355,14 +335,6 @@ out: return elen; } -int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, - char *buf) -{ - WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)); - - return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf); -} - /** * ceph_fname_to_usr - convert a filename for userland presentation * @fname: ceph_fname to be converted @@ -516,15 +488,13 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num, - gfp_t gfp_flags) + unsigned int offs, u64 lblk_num) { struct ceph_client *cl = ceph_inode_to_client(inode); doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, ceph_vinop(inode), len, offs, lblk_num); - return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num, - gfp_flags); + return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num); } /** @@ -642,9 +612,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, * @page: pointer to page array * @off: offset into the file that the data starts * @len: max length to encrypt - * @gfp: gfp flags to use for allocation * - * Decrypt an array of cleartext pages and return the amount of + * Encrypt an array of cleartext pages and return the amount of * data encrypted. Any data in the page prior to the start of the * first complete block in the read is ignored. Any incomplete * crypto blocks at the end of the array are ignored. @@ -652,7 +621,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, * Returns the length of the encrypted data or a negative errno. */ int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, - int len, gfp_t gfp) + int len) { int i, num_blocks; u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; @@ -673,7 +642,7 @@ int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx], CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, - baseblk + i, gfp); + baseblk + i); if (fret < 0) { if (ret == 0) ret = fret; diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index d0768239a1c9..23612b2e9837 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -102,10 +102,7 @@ int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, struct ceph_acl_sec_ctx *as); void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as); -int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, - char *buf); -int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, - char *buf); +int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int len); static inline int ceph_fname_alloc_buffer(struct inode *parent, struct fscrypt_str *fname) @@ -155,15 +152,14 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, unsigned int offs, u64 lblk_num); int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num, - gfp_t gfp_flags); + unsigned int offs, u64 lblk_num); int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, u64 off, int len); int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, u64 off, struct ceph_sparse_extent *map, u32 ext_cnt); int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, - int len, gfp_t gfp); + int len); static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) { @@ -194,17 +190,10 @@ static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, { } -static inline int ceph_encode_encrypted_dname(struct inode *parent, - struct qstr *d_name, char *buf) -{ - memcpy(buf, d_name->name, d_name->len); - return d_name->len; -} - -static inline int ceph_encode_encrypted_fname(struct inode *parent, - struct dentry *dentry, char *buf) +static inline int ceph_encode_encrypted_dname(struct inode *parent, char *buf, + int len) { - return -EOPNOTSUPP; + return len; } static inline int ceph_fname_alloc_buffer(struct inode *parent, @@ -246,8 +235,7 @@ static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num, - gfp_t gfp_flags) + unsigned int offs, u64 lblk_num) { return 0; } @@ -269,7 +257,7 @@ static inline int ceph_fscrypt_decrypt_extents(struct inode *inode, static inline int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, - int len, gfp_t gfp) + int len) { return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a321aa6d0ed2..8478e7e75df6 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -423,17 +423,16 @@ more: req->r_inode_drop = CEPH_CAP_FILE_EXCL; } if (dfi->last_name) { - struct qstr d_name = { .name = dfi->last_name, - .len = strlen(dfi->last_name) }; + int len = strlen(dfi->last_name); req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } + memcpy(req->r_path2, dfi->last_name, len); - err = ceph_encode_encrypted_dname(inode, &d_name, - req->r_path2); + err = ceph_encode_encrypted_dname(inode, req->r_path2, len); if (err < 0) { ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 150076ced937..b2f2af104679 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -33,12 +33,19 @@ struct ceph_nfs_snapfh { u32 hash; } __attribute__ ((packed)); +#define BYTES_PER_U32 (sizeof(u32)) +#define CEPH_FH_BASIC_SIZE \ + (sizeof(struct ceph_nfs_fh) / BYTES_PER_U32) +#define CEPH_FH_WITH_PARENT_SIZE \ + (sizeof(struct ceph_nfs_confh) / BYTES_PER_U32) +#define CEPH_FH_SNAPPED_INODE_SIZE \ + (sizeof(struct ceph_nfs_snapfh) / BYTES_PER_U32) + static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { struct ceph_client *cl = ceph_inode_to_client(inode); - static const int snap_handle_length = - sizeof(struct ceph_nfs_snapfh) >> 2; + static const int snap_handle_length = CEPH_FH_SNAPPED_INODE_SIZE; struct ceph_nfs_snapfh *sfh = (void *)rawfh; u64 snapid = ceph_snap(inode); int ret; @@ -88,10 +95,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { struct ceph_client *cl = ceph_inode_to_client(inode); - static const int handle_length = - sizeof(struct ceph_nfs_fh) >> 2; - static const int connected_handle_length = - sizeof(struct ceph_nfs_confh) >> 2; + static const int handle_length = CEPH_FH_BASIC_SIZE; + static const int connected_handle_length = CEPH_FH_WITH_PARENT_SIZE; int type; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -308,7 +313,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, if (fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) return NULL; - if (fh_len < sizeof(*fh) / 4) + if (fh_len < sizeof(*fh) / BYTES_PER_U32) return NULL; doutc(fsc->client, "%llx\n", fh->ino); @@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, if (fh_type != FILEID_INO32_GEN_PARENT) return NULL; - if (fh_len < sizeof(*cfh) / 4) + if (fh_len < sizeof(*cfh) / BYTES_PER_U32) return NULL; doutc(fsc->client, "%llx\n", cfh->parent_ino); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 851d70200c6b..c02f100f8552 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1992,8 +1992,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (IS_ENCRYPTED(inode)) { ret = ceph_fscrypt_encrypt_pages(inode, pages, - write_pos, write_len, - GFP_KERNEL); + write_pos, write_len); if (ret < 0) { doutc(cl, "encryption failed with %d\n", ret); ceph_release_page_vector(pages, num_pages); @@ -2530,19 +2529,19 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); } -static inline void ceph_zero_partial_page( - struct inode *inode, loff_t offset, unsigned size) +static inline void ceph_zero_partial_page(struct inode *inode, + loff_t offset, size_t size) { - struct page *page; - pgoff_t index = offset >> PAGE_SHIFT; + struct folio *folio; - page = find_lock_page(inode->i_mapping, index); - if (page) { - wait_on_page_writeback(page); - zero_user(page, offset & (PAGE_SIZE - 1), size); - unlock_page(page); - put_page(page); - } + folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + folio_wait_writeback(folio); + folio_zero_range(folio, offset_in_folio(folio, offset), size); + folio_unlock(folio); + folio_put(folio); } static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, @@ -2616,7 +2615,7 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) s32 stripe_unit = ci->i_layout.stripe_unit; s32 stripe_count = ci->i_layout.stripe_count; s32 object_size = ci->i_layout.object_size; - u64 object_set_size = object_size * stripe_count; + u64 object_set_size = (u64) object_size * stripe_count; u64 nearly, t; /* round offset up to next period boundary */ @@ -3171,7 +3170,7 @@ const struct file_operations ceph_file_fops = { .llseek = ceph_llseek, .read_iter = ceph_read_iter, .write_iter = ceph_write_iter, - .mmap = ceph_mmap, + .mmap_prepare = ceph_mmap_prepare, .fsync = ceph_fsync, .lock = ceph_lock, .setlease = simple_nosetlease, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 06cd2963e41e..fc543075b827 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2436,8 +2436,7 @@ static int fill_fscrypt_truncate(struct inode *inode, /* encrypt the last block */ ret = ceph_fscrypt_encrypt_block_inplace(inode, page, CEPH_FSCRYPT_BLOCK_SIZE, - 0, block, - GFP_KERNEL); + 0, block); if (ret) goto out; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 230e0c3f341f..0f497c39ff82 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2766,8 +2766,8 @@ retry: } if (fscrypt_has_encryption_key(d_inode(parent))) { - len = ceph_encode_encrypted_fname(d_inode(parent), - cur, buf); + len = ceph_encode_encrypted_dname(d_inode(parent), + buf, len); if (len < 0) { dput(parent); dput(cur); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f3951253e393..c3eb651862c5 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1033,8 +1033,7 @@ void ceph_umount_begin(struct super_block *sb) struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); doutc(fsc->client, "starting forced umount\n"); - if (!fsc) - return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; __ceph_umount_begin(fsc); } @@ -1220,13 +1219,14 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ s->s_op = &ceph_super_ops; - s->s_d_op = &ceph_dentry_ops; + set_default_d_op(s, &ceph_dentry_ops); s->s_export_op = &ceph_export_ops; s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; s->s_flags |= SB_NODIRATIME | SB_NOATIME; + s->s_magic = CEPH_SUPER_MAGIC; ceph_fscrypt_set_ops(s); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index bb0db0cc8003..cf176aab0f82 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1286,7 +1286,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci, /* addr.c */ extern const struct address_space_operations ceph_aops; extern const struct netfs_request_ops ceph_netfs_ops; -extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +int ceph_mmap_prepare(struct vm_area_desc *desc); extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index ab69d8f0cec2..ca9990017265 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -429,17 +429,9 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; - if (host_file->f_op->iterate_shared) { - struct inode *host_inode = file_inode(host_file); - ret = -ENOENT; - if (!IS_DEADDIR(host_inode)) { - inode_lock_shared(host_inode); - ret = host_file->f_op->iterate_shared(host_file, ctx); - file_accessed(host_file); - inode_unlock_shared(host_inode); - } + ret = iterate_dir(host_file, ctx); + if (ret != -ENOTDIR) return ret; - } /* Venus: we must read Venus dirents from a file */ return coda_venus_readdir(coda_file, ctx); } diff --git a/fs/coda/file.c b/fs/coda/file.c index 148856a582a9..a390b5d21196 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -160,7 +160,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) size_t count; int ret; - if (!host_file->f_op->mmap) + if (!can_mmap_file(host_file)) return -ENODEV; if (WARN_ON(coda_file != vma->vm_file)) @@ -199,10 +199,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) spin_unlock(&cii->c_lock); vma->vm_file = get_file(host_file); - ret = call_mmap(vma->vm_file, vma); + ret = vfs_mmap(vma->vm_file, vma); if (ret) { - /* if call_mmap fails, our caller will put host_file so we + /* if vfs_mmap fails, our caller will put host_file so we * should drop the reference to the coda_file that we got. */ fput(coda_file); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6896fce122e1..08450d006016 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -230,7 +230,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; - sb->s_d_op = &coda_dentry_operations; + set_default_d_op(sb, &coda_dentry_operations); sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 272b64456999..1fcd761fe7be 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - select SYSFS help configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ebf32822e29b..f327fbb9a0ca 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -67,7 +67,6 @@ static void configfs_d_iput(struct dentry * dentry, const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, - .d_delete = always_delete_dentry, }; #ifdef CONFIG_LOCKDEP diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index c2d820063ec4..740f18b60c9d 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -92,7 +92,8 @@ static int configfs_fill_super(struct super_block *sb, struct fs_context *fc) configfs_root_group.cg_item.ci_dentry = root; root->d_fsdata = &configfs_root; sb->s_root = root; - sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ + set_default_d_op(sb, &configfs_dentry_ops); /* the rest get that */ + sb->s_d_flags |= DCACHE_DONTCACHE; return 0; } diff --git a/fs/coredump.c b/fs/coredump.c index f217ebf2b3b6..fedbead956ed 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -51,6 +51,7 @@ #include <net/sock.h> #include <uapi/linux/pidfd.h> #include <uapi/linux/un.h> +#include <uapi/linux/coredump.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -81,17 +82,22 @@ static unsigned int core_sort_vma; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; +static atomic_t core_pipe_count = ATOMIC_INIT(0); enum coredump_type_t { - COREDUMP_FILE = 1, - COREDUMP_PIPE = 2, - COREDUMP_SOCK = 3, + COREDUMP_FILE = 1, + COREDUMP_PIPE = 2, + COREDUMP_SOCK = 3, + COREDUMP_SOCK_REQ = 4, }; struct core_name { char *corename; int used, size; + unsigned int core_pipe_limit; + bool core_dumped; enum coredump_type_t core_type; + u64 mask; }; static int expand_corename(struct core_name *cn, int size) @@ -222,11 +228,12 @@ put_exe_file: return ret; } -/* format_corename will inspect the pattern parameter, and output a - * name into corename, which must have space for at least - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. +/* + * coredump_parse will inspect the pattern parameter, and output a name + * into corename, which must have space for at least CORENAME_MAX_SIZE + * bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm, +static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, size_t **argv, int *argc) { const struct cred *cred = current_cred(); @@ -235,8 +242,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, int pid_in_pattern = 0; int err = 0; + cn->mask = COREDUMP_KERNEL; + if (core_pipe_limit) + cn->mask |= COREDUMP_WAIT; cn->used = 0; cn->corename = NULL; + cn->core_pipe_limit = 0; + cn->core_dumped = false; if (*pat_ptr == '|') cn->core_type = COREDUMP_PIPE; else if (*pat_ptr == '@') @@ -244,7 +256,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, else cn->core_type = COREDUMP_FILE; if (expand_corename(cn, core_name_size)) - return -ENOMEM; + return false; cn->corename[0] = '\0'; switch (cn->core_type) { @@ -252,26 +264,33 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, int argvs = sizeof(core_pattern) / 2; (*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL); if (!(*argv)) - return -ENOMEM; + return false; (*argv)[(*argc)++] = 0; ++pat_ptr; if (!(*pat_ptr)) - return -ENOMEM; + return false; break; } case COREDUMP_SOCK: { /* skip the @ */ pat_ptr++; if (!(*pat_ptr)) - return -ENOMEM; + return false; + if (*pat_ptr == '@') { + pat_ptr++; + if (!(*pat_ptr)) + return false; + + cn->core_type = COREDUMP_SOCK_REQ; + } err = cn_printf(cn, "%s", pat_ptr); if (err) - return err; + return false; /* Require absolute paths. */ if (cn->corename[0] != '/') - return -EINVAL; + return false; /* * Ensure we can uses spaces to indicate additional @@ -279,7 +298,18 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, */ if (strchr(cn->corename, ' ')) { coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename); - return -EINVAL; + return false; + } + + /* Must not contain ".." in the path. */ + if (name_contains_dotdot(cn->corename)) { + coredump_report_failure("Coredump socket may not %s contain '..' spaces", cn->corename); + return false; + } + + if (strlen(cn->corename) >= UNIX_PATH_MAX) { + coredump_report_failure("Coredump socket path %s too long", cn->corename); + return false; } /* @@ -289,13 +319,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, * via /proc/<pid>, using the SO_PEERPIDFD to guard * against pid recycling when opening /proc/<pid>. */ - return 0; + return true; } case COREDUMP_FILE: break; default: WARN_ON_ONCE(true); - return -EINVAL; + return false; } /* Repeat as long as we have more pattern to process and more output @@ -433,7 +463,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, } if (err) - return err; + return false; } out: @@ -443,9 +473,9 @@ out: * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid) - return cn_printf(cn, ".%d", task_tgid_vnr(current)); + return cn_printf(cn, ".%d", task_tgid_vnr(current)) == 0; - return 0; + return true; } static int zap_process(struct signal_struct *signal, int exit_code) @@ -632,21 +662,440 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) return 0; } -void do_coredump(const kernel_siginfo_t *siginfo) +#ifdef CONFIG_UNIX +static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *cprm) +{ + struct file *file __free(fput) = NULL; + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + ssize_t addr_len; + int retval; + struct socket *socket; + + addr_len = strscpy(addr.sun_path, cn->corename); + if (addr_len < 0) + return false; + addr_len += offsetof(struct sockaddr_un, sun_path) + 1; + + /* + * It is possible that the userspace process which is supposed + * to handle the coredump and is listening on the AF_UNIX socket + * coredumps. Userspace should just mark itself non dumpable. + */ + + retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket); + if (retval < 0) + return false; + + file = sock_alloc_file(socket, 0, NULL); + if (IS_ERR(file)) + return false; + + /* + * Set the thread-group leader pid which is used for the peer + * credentials during connect() below. Then immediately register + * it in pidfs... + */ + cprm->pid = task_tgid(current); + retval = pidfs_register_pid(cprm->pid); + if (retval) + return false; + + /* + * ... and set the coredump information so userspace has it + * available after connect()... + */ + pidfs_coredump(cprm); + + retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len, + O_NONBLOCK | SOCK_COREDUMP); + + if (retval) { + if (retval == -EAGAIN) + coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); + else + coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval); + return false; + } + + /* ... and validate that @sk_peer_pid matches @cprm.pid. */ + if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm->pid)) + return false; + + cprm->limit = RLIM_INFINITY; + cprm->file = no_free_ptr(file); + + return true; +} + +static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags) +{ + struct msghdr msg = {}; + struct kvec iov = { .iov_base = ack, .iov_len = size }; + ssize_t ret; + + memset(ack, 0, size); + ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags); + return ret == size; +} + +static inline bool coredump_sock_send(struct file *file, struct coredump_req *req) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) }; + ssize_t ret; + + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req)); + return ret == sizeof(*req); +} + +static_assert(sizeof(enum coredump_mark) == sizeof(__u32)); + +static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) }; + ssize_t ret; + + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark)); + return ret == sizeof(mark); +} + +static inline void coredump_sock_wait(struct file *file) +{ + ssize_t n; + + /* + * We use a simple read to wait for the coredump processing to + * finish. Either the socket is closed or we get sent unexpected + * data. In both cases, we're done. + */ + n = __kernel_read(file, &(char){ 0 }, 1, NULL); + if (n > 0) + coredump_report_failure("Coredump socket had unexpected data"); + else if (n < 0) + coredump_report_failure("Coredump socket failed"); +} + +static inline void coredump_sock_shutdown(struct file *file) +{ + struct socket *socket; + + socket = sock_from_file(file); + if (!socket) + return; + + /* Let userspace know we're done processing the coredump. */ + kernel_sock_shutdown(socket, SHUT_WR); +} + +static bool coredump_sock_request(struct core_name *cn, struct coredump_params *cprm) +{ + struct coredump_req req = { + .size = sizeof(struct coredump_req), + .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT, + .size_ack = sizeof(struct coredump_ack), + }; + struct coredump_ack ack = {}; + ssize_t usize; + + if (cn->core_type != COREDUMP_SOCK_REQ) + return true; + + /* Let userspace know what we support. */ + if (!coredump_sock_send(cprm->file, &req)) + return false; + + /* Peek the size of the coredump_ack. */ + if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size), + MSG_PEEK | MSG_WAITALL)) + return false; + + /* Refuse unknown coredump_ack sizes. */ + usize = ack.size; + if (usize < COREDUMP_ACK_SIZE_VER0) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE); + return false; + } + + if (usize > sizeof(ack)) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE); + return false; + } + + /* Now retrieve the coredump_ack. */ + if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL)) + return false; + if (ack.size != usize) + return false; + + /* Refuse unknown coredump_ack flags. */ + if (ack.mask & ~req.mask) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED); + return false; + } + + /* Refuse mutually exclusive options. */ + if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL | + COREDUMP_REJECT)) != 1) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING); + return false; + } + + if (ack.spare) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED); + return false; + } + + cn->mask = ack.mask; + return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK); +} + +static bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) +{ + if (!coredump_sock_connect(cn, cprm)) + return false; + + return coredump_sock_request(cn, cprm); +} +#else +static inline void coredump_sock_wait(struct file *file) { } +static inline void coredump_sock_shutdown(struct file *file) { } +static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; } +#endif + +/* cprm->mm_flags contains a stable snapshot of dumpability flags. */ +static inline bool coredump_force_suid_safe(const struct coredump_params *cprm) +{ + /* Require nonrelative corefile path and be extra careful. */ + return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT; +} + +static bool coredump_file(struct core_name *cn, struct coredump_params *cprm, + const struct linux_binfmt *binfmt) +{ + struct mnt_idmap *idmap; + struct inode *inode; + struct file *file __free(fput) = NULL; + int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL; + + if (cprm->limit < binfmt->min_coredump) + return false; + + if (coredump_force_suid_safe(cprm) && cn->corename[0] != '/') { + coredump_report_failure("this process can only dump core to a fully qualified path, skipping core dump"); + return false; + } + + /* + * Unlink the file if it exists unless this is a SUID + * binary - in that case, we're running around with root + * privs and don't want to unlink another user's coredump. + */ + if (!coredump_force_suid_safe(cprm)) { + /* + * If it doesn't exist, that's fine. If there's some + * other problem, we'll catch it at the filp_open(). + */ + do_unlinkat(AT_FDCWD, getname_kernel(cn->corename)); + } + + /* + * There is a race between unlinking and creating the + * file, but if that causes an EEXIST here, that's + * fine - another process raced with us while creating + * the corefile, and the other process won. To userspace, + * what matters is that at least one of the two processes + * writes its coredump successfully, not which one. + */ + if (coredump_force_suid_safe(cprm)) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. + */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + file = file_open_root(&root, cn->corename, open_flags, 0600); + path_put(&root); + } else { + file = filp_open(cn->corename, open_flags, 0600); + } + if (IS_ERR(file)) + return false; + + inode = file_inode(file); + if (inode->i_nlink > 1) + return false; + if (d_unhashed(file->f_path.dentry)) + return false; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + return false; + /* + * Don't dump core if the filesystem changed owner or mode + * of the file during file creation. This is an issue when + * a process dumps core while its cwd is e.g. on a vfat + * filesystem. + */ + idmap = file_mnt_idmap(file); + if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) { + coredump_report_failure("Core dump to %s aborted: cannot preserve file owner", cn->corename); + return false; + } + if ((inode->i_mode & 0677) != 0600) { + coredump_report_failure("Core dump to %s aborted: cannot preserve file permissions", cn->corename); + return false; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) + return false; + if (do_truncate(idmap, file->f_path.dentry, 0, 0, file)) + return false; + + cprm->file = no_free_ptr(file); + return true; +} + +static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm, + size_t *argv, int argc) { + int argi; + char **helper_argv __free(kfree) = NULL; + struct subprocess_info *sub_info; + + if (cprm->limit == 1) { + /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. + * + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * cprm.limit of 1 here as a special value, this is a + * consistent way to catch recursive crashes. + * We can still crash if the core_pattern binary sets + * RLIM_CORE = !1, but it runs as root, and can do + * lots of stupid things. + * + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); + return false; + } + cprm->limit = RLIM_INFINITY; + + cn->core_pipe_limit = atomic_inc_return(&core_pipe_count); + if (core_pipe_limit && (core_pipe_limit < cn->core_pipe_limit)) { + coredump_report_failure("over core_pipe_limit, skipping core dump"); + return false; + } + + helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL); + if (!helper_argv) { + coredump_report_failure("%s failed to allocate memory", __func__); + return false; + } + for (argi = 0; argi < argc; argi++) + helper_argv[argi] = cn->corename + argv[argi]; + helper_argv[argi] = NULL; + + sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, + GFP_KERNEL, umh_coredump_setup, + NULL, cprm); + if (!sub_info) + return false; + + if (call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC)) { + coredump_report_failure("|%s pipe failed", cn->corename); + return false; + } + + /* + * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would + * have this set to NULL. + */ + if (!cprm->file) { + coredump_report_failure("Core dump to |%s disabled", cn->corename); + return false; + } + + return true; +} + +static bool coredump_write(struct core_name *cn, + struct coredump_params *cprm, + struct linux_binfmt *binfmt) +{ + + if (dump_interrupted()) + return true; + + if (!dump_vma_snapshot(cprm)) + return false; + + file_start_write(cprm->file); + cn->core_dumped = binfmt->core_dump(cprm); + /* + * Ensures that file size is big enough to contain the current + * file postion. This prevents gdb from complaining about + * a truncated file if the last "write" to the file was + * dump_skip. + */ + if (cprm->to_skip) { + cprm->to_skip--; + dump_emit(cprm, "", 1); + } + file_end_write(cprm->file); + free_vma_snapshot(cprm); + return true; +} + +static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm) +{ + if (cprm->file) + filp_close(cprm->file, NULL); + if (cn->core_pipe_limit) { + VFS_WARN_ON_ONCE(cn->core_type != COREDUMP_PIPE); + atomic_dec(&core_pipe_count); + } + kfree(cn->corename); + coredump_finish(cn->core_dumped); +} + +static inline bool coredump_skip(const struct coredump_params *cprm, + const struct linux_binfmt *binfmt) +{ + if (!binfmt) + return true; + if (!binfmt->core_dump) + return true; + if (!__get_dumpable(cprm->mm_flags)) + return true; + return false; +} + +void vfs_coredump(const kernel_siginfo_t *siginfo) +{ + struct cred *cred __free(put_cred) = NULL; + size_t *argv __free(kfree) = NULL; struct core_state core_state; struct core_name cn; struct mm_struct *mm = current->mm; - struct linux_binfmt * binfmt; + struct linux_binfmt *binfmt = mm->binfmt; const struct cred *old_cred; - struct cred *cred; - int retval = 0; - size_t *argv = NULL; int argc = 0; - /* require nonrelative corefile path and be extra careful */ - bool need_suid_safe = false; - bool core_dumped = false; - static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, .limit = rlimit(RLIMIT_CORE), @@ -662,357 +1111,92 @@ void do_coredump(const kernel_siginfo_t *siginfo) audit_core_dumps(siginfo->si_signo); - binfmt = mm->binfmt; - if (!binfmt || !binfmt->core_dump) - goto fail; - if (!__get_dumpable(cprm.mm_flags)) - goto fail; + if (coredump_skip(&cprm, binfmt)) + return; cred = prepare_creds(); if (!cred) - goto fail; + return; /* * We cannot trust fsuid as being the "true" uid of the process * nor do we know its entire history. We only know it was tainted * so we dump it as root in mode 2, and only into a controlled * environment (pipe handler or fully qualified path). */ - if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) { - /* Setuid core dump mode */ - cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ - need_suid_safe = true; - } + if (coredump_force_suid_safe(&cprm)) + cred->fsuid = GLOBAL_ROOT_UID; - retval = coredump_wait(siginfo->si_signo, &core_state); - if (retval < 0) - goto fail_creds; + if (coredump_wait(siginfo->si_signo, &core_state) < 0) + return; old_cred = override_creds(cred); - retval = format_corename(&cn, &cprm, &argv, &argc); - if (retval < 0) { + if (!coredump_parse(&cn, &cprm, &argv, &argc)) { coredump_report_failure("format_corename failed, aborting core"); - goto fail_unlock; + goto close_fail; } switch (cn.core_type) { - case COREDUMP_FILE: { - struct mnt_idmap *idmap; - struct inode *inode; - int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | - O_LARGEFILE | O_EXCL; - - if (cprm.limit < binfmt->min_coredump) - goto fail_unlock; - - if (need_suid_safe && cn.corename[0] != '/') { - coredump_report_failure( - "this process can only dump core to a fully qualified path, skipping core dump"); - goto fail_unlock; - } - - /* - * Unlink the file if it exists unless this is a SUID - * binary - in that case, we're running around with root - * privs and don't want to unlink another user's coredump. - */ - if (!need_suid_safe) { - /* - * If it doesn't exist, that's fine. If there's some - * other problem, we'll catch it at the filp_open(). - */ - do_unlinkat(AT_FDCWD, getname_kernel(cn.corename)); - } - - /* - * There is a race between unlinking and creating the - * file, but if that causes an EEXIST here, that's - * fine - another process raced with us while creating - * the corefile, and the other process won. To userspace, - * what matters is that at least one of the two processes - * writes its coredump successfully, not which one. - */ - if (need_suid_safe) { - /* - * Using user namespaces, normal user tasks can change - * their current->fs->root to point to arbitrary - * directories. Since the intention of the "only dump - * with a fully qualified path" rule is to control where - * coredumps may be placed using root privileges, - * current->fs->root must not be used. Instead, use the - * root directory of init_task. - */ - struct path root; - - task_lock(&init_task); - get_fs_root(init_task.fs, &root); - task_unlock(&init_task); - cprm.file = file_open_root(&root, cn.corename, - open_flags, 0600); - path_put(&root); - } else { - cprm.file = filp_open(cn.corename, open_flags, 0600); - } - if (IS_ERR(cprm.file)) - goto fail_unlock; - - inode = file_inode(cprm.file); - if (inode->i_nlink > 1) - goto close_fail; - if (d_unhashed(cprm.file->f_path.dentry)) - goto close_fail; - /* - * AK: actually i see no reason to not allow this for named - * pipes etc, but keep the previous behaviour for now. - */ - if (!S_ISREG(inode->i_mode)) - goto close_fail; - /* - * Don't dump core if the filesystem changed owner or mode - * of the file during file creation. This is an issue when - * a process dumps core while its cwd is e.g. on a vfat - * filesystem. - */ - idmap = file_mnt_idmap(cprm.file); - if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), - current_fsuid())) { - coredump_report_failure("Core dump to %s aborted: " - "cannot preserve file owner", cn.corename); - goto close_fail; - } - if ((inode->i_mode & 0677) != 0600) { - coredump_report_failure("Core dump to %s aborted: " - "cannot preserve file permissions", cn.corename); - goto close_fail; - } - if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) - goto close_fail; - if (do_truncate(idmap, cprm.file->f_path.dentry, - 0, 0, cprm.file)) + case COREDUMP_FILE: + if (!coredump_file(&cn, &cprm, binfmt)) goto close_fail; break; - } - case COREDUMP_PIPE: { - int argi; - int dump_count; - char **helper_argv; - struct subprocess_info *sub_info; - - if (cprm.limit == 1) { - /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. - * - * Normally core limits are irrelevant to pipes, since - * we're not writing to the file system, but we use - * cprm.limit of 1 here as a special value, this is a - * consistent way to catch recursive crashes. - * We can still crash if the core_pattern binary sets - * RLIM_CORE = !1, but it runs as root, and can do - * lots of stupid things. - * - * Note that we use task_tgid_vnr here to grab the pid - * of the process group leader. That way we get the - * right pid if a thread in a multi-threaded - * core_pattern process dies. - */ - coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); - goto fail_unlock; - } - cprm.limit = RLIM_INFINITY; - - dump_count = atomic_inc_return(&core_dump_count); - if (core_pipe_limit && (core_pipe_limit < dump_count)) { - coredump_report_failure("over core_pipe_limit, skipping core dump"); - goto fail_dropcount; - } - - helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), - GFP_KERNEL); - if (!helper_argv) { - coredump_report_failure("%s failed to allocate memory", __func__); - goto fail_dropcount; - } - for (argi = 0; argi < argc; argi++) - helper_argv[argi] = cn.corename + argv[argi]; - helper_argv[argi] = NULL; - - retval = -ENOMEM; - sub_info = call_usermodehelper_setup(helper_argv[0], - helper_argv, NULL, GFP_KERNEL, - umh_coredump_setup, NULL, &cprm); - if (sub_info) - retval = call_usermodehelper_exec(sub_info, - UMH_WAIT_EXEC); - - kfree(helper_argv); - if (retval) { - coredump_report_failure("|%s pipe failed", cn.corename); + case COREDUMP_PIPE: + if (!coredump_pipe(&cn, &cprm, argv, argc)) goto close_fail; - } break; - } - case COREDUMP_SOCK: { -#ifdef CONFIG_UNIX - struct file *file __free(fput) = NULL; - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - ssize_t addr_len; - struct socket *socket; - - addr_len = strscpy(addr.sun_path, cn.corename); - if (addr_len < 0) - goto close_fail; - addr_len += offsetof(struct sockaddr_un, sun_path) + 1; - - /* - * It is possible that the userspace process which is - * supposed to handle the coredump and is listening on - * the AF_UNIX socket coredumps. Userspace should just - * mark itself non dumpable. - */ - - retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket); - if (retval < 0) - goto close_fail; - - file = sock_alloc_file(socket, 0, NULL); - if (IS_ERR(file)) - goto close_fail; - - /* - * Set the thread-group leader pid which is used for the - * peer credentials during connect() below. Then - * immediately register it in pidfs... - */ - cprm.pid = task_tgid(current); - retval = pidfs_register_pid(cprm.pid); - if (retval) - goto close_fail; - - /* - * ... and set the coredump information so userspace - * has it available after connect()... - */ - pidfs_coredump(&cprm); - - retval = kernel_connect(socket, (struct sockaddr *)(&addr), - addr_len, O_NONBLOCK | SOCK_COREDUMP); - - /* - * ... Make sure to only put our reference after connect() took - * its own reference keeping the pidfs entry alive ... - */ - pidfs_put_pid(cprm.pid); - - if (retval) { - if (retval == -EAGAIN) - coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); - else - coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval); + case COREDUMP_SOCK_REQ: + fallthrough; + case COREDUMP_SOCK: + if (!coredump_socket(&cn, &cprm)) goto close_fail; - } - - /* ... and validate that @sk_peer_pid matches @cprm.pid. */ - if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid)) - goto close_fail; - - cprm.limit = RLIM_INFINITY; - cprm.file = no_free_ptr(file); -#else - coredump_report_failure("Core dump socket support %s disabled", cn.corename); - goto close_fail; -#endif break; - } default: WARN_ON_ONCE(true); goto close_fail; } + /* Don't even generate the coredump. */ + if (cn.mask & COREDUMP_REJECT) + goto close_fail; + /* get us an unshared descriptor table; almost always a no-op */ /* The cell spufs coredump code reads the file descriptor tables */ - retval = unshare_files(); - if (retval) + if (unshare_files()) goto close_fail; - if (!dump_interrupted()) { - /* - * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would - * have this set to NULL. - */ - if (!cprm.file) { - coredump_report_failure("Core dump to |%s disabled", cn.corename); - goto close_fail; - } - if (!dump_vma_snapshot(&cprm)) - goto close_fail; - file_start_write(cprm.file); - core_dumped = binfmt->core_dump(&cprm); - /* - * Ensures that file size is big enough to contain the current - * file postion. This prevents gdb from complaining about - * a truncated file if the last "write" to the file was - * dump_skip. - */ - if (cprm.to_skip) { - cprm.to_skip--; - dump_emit(&cprm, "", 1); - } - file_end_write(cprm.file); - free_vma_snapshot(&cprm); - } + if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt)) + goto close_fail; -#ifdef CONFIG_UNIX - /* Let userspace know we're done processing the coredump. */ - if (sock_from_file(cprm.file)) - kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR); -#endif + coredump_sock_shutdown(cprm.file); + + /* Let the parent know that a coredump was generated. */ + if (cn.mask & COREDUMP_USERSPACE) + cn.core_dumped = true; /* * When core_pipe_limit is set we wait for the coredump server * or usermodehelper to finish before exiting so it can e.g., * inspect /proc/<pid>. */ - if (core_pipe_limit) { + if (cn.mask & COREDUMP_WAIT) { switch (cn.core_type) { case COREDUMP_PIPE: wait_for_dump_helpers(cprm.file); break; -#ifdef CONFIG_UNIX - case COREDUMP_SOCK: { - ssize_t n; - - /* - * We use a simple read to wait for the coredump - * processing to finish. Either the socket is - * closed or we get sent unexpected data. In - * both cases, we're done. - */ - n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL); - if (n != 0) - coredump_report_failure("Unexpected data on coredump socket"); + case COREDUMP_SOCK_REQ: + fallthrough; + case COREDUMP_SOCK: + coredump_sock_wait(cprm.file); break; - } -#endif default: break; } } close_fail: - if (cprm.file) - filp_close(cprm.file, NULL); -fail_dropcount: - if (cn.core_type == COREDUMP_PIPE) - atomic_dec(&core_dump_count); -fail_unlock: - kfree(argv); - kfree(cn.corename); - coredump_finish(core_dumped); + coredump_cleanup(&cn, &cprm); revert_creds(old_cred); -fail_creds: - put_cred(cred); -fail: return; } @@ -1238,6 +1422,8 @@ void validate_coredump_safety(void) static inline bool check_coredump_socket(void) { + const char *p; + if (core_pattern[0] != '@') return true; @@ -1249,8 +1435,25 @@ static inline bool check_coredump_socket(void) if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns) return false; - /* Must be an absolute path. */ - if (*(core_pattern + 1) != '/') + /* Must be an absolute path... */ + if (core_pattern[1] != '/') { + /* ... or the socket request protocol... */ + if (core_pattern[1] != '@') + return false; + /* ... and if so must be an absolute path. */ + if (core_pattern[2] != '/') + return false; + p = &core_pattern[2]; + } else { + p = &core_pattern[1]; + } + + /* The path obviously cannot exceed UNIX_PATH_MAX. */ + if (strlen(p) >= UNIX_PATH_MAX) + return false; + + /* Must not contain ".." in the path. */ + if (name_contains_dotdot(core_pattern)) return false; return true; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b84d1747a020..b002e9b734f9 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -17,7 +17,6 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> -#include <linux/pfn_t.h> #include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> @@ -412,8 +411,8 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; - pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, + address + off); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 0ad8c30b8fa5..486fcb2ecf13 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -7,10 +7,12 @@ * Copyright (C) 2015, Motorola Mobility */ -#include <linux/pagemap.h> -#include <linux/module.h> #include <linux/bio.h> +#include <linux/export.h> +#include <linux/module.h> #include <linux/namei.h> +#include <linux/pagemap.h> + #include "fscrypt_private.h" /** @@ -165,8 +167,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, do { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index, ZERO_PAGE(0), pages[i], - du_size, offset, - GFP_NOFS); + du_size, offset); if (err) goto out; du_index++; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b74b5937e695..b6ccab524fde 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -20,12 +20,14 @@ * Special Publication 800-38E and IEEE P1619/D16. */ -#include <linux/pagemap.h> +#include <crypto/skcipher.h> +#include <linux/export.h> #include <linux/mempool.h> #include <linux/module.h> -#include <linux/scatterlist.h> +#include <linux/pagemap.h> #include <linux/ratelimit.h> -#include <crypto/skcipher.h> +#include <linux/scatterlist.h> + #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; @@ -108,15 +110,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags) + unsigned int len, unsigned int offs) { + struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; - struct skcipher_request *req = NULL; - DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; - struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; - int res = 0; + int err; if (WARN_ON_ONCE(len <= 0)) return -EINVAL; @@ -125,31 +125,23 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_generate_iv(&iv, index, ci); - req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) - return -ENOMEM; - skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &wait); - + NULL, NULL); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); sg_set_page(&src, src_page, len, offs); skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) - res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); + err = crypto_skcipher_decrypt(req); else - res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - skcipher_request_free(req); - if (res) { + err = crypto_skcipher_encrypt(req); + if (err) fscrypt_err(ci->ci_inode, "%scryption failed for data unit %llu: %d", - (rw == FS_DECRYPT ? "De" : "En"), index, res); - return res; - } - return 0; + (rw == FS_DECRYPT ? "De" : "En"), index, err); + return err; } /** @@ -204,7 +196,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, for (i = offs; i < offs + len; i += du_size, index++) { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, &folio->page, ciphertext_page, - du_size, i, gfp_flags); + du_size, i); if (err) { fscrypt_free_bounce_page(ciphertext_page); return ERR_PTR(err); @@ -225,7 +217,6 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); * @offs: Byte offset within @page at which the block to encrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file - * @gfp_flags: Memory allocation flags * * Encrypt a possibly-compressed filesystem block that is located in an * arbitrary page, not necessarily in the original pagecache page. The @inode @@ -237,13 +228,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); */ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, - u64 lblk_num, gfp_t gfp_flags) + u64 lblk_num) { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs, - gfp_flags); + lblk_num, page, page, len, offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -283,8 +273,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, struct page *page = folio_page(folio, i >> PAGE_SHIFT); err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page, - page, du_size, i & ~PAGE_MASK, - GFP_NOFS); + page, du_size, i & ~PAGE_MASK); if (err) return err; } @@ -317,8 +306,7 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs, - GFP_NOFS); + lblk_num, page, page, len, offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 010f9c0a4c2f..f9f6713e144f 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -11,11 +11,13 @@ * This has not yet undergone a rigorous security audit. */ -#include <linux/namei.h> -#include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/sha2.h> #include <crypto/skcipher.h> +#include <linux/export.h> +#include <linux/namei.h> +#include <linux/scatterlist.h> + #include "fscrypt_private.h" /* @@ -92,13 +94,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - struct skcipher_request *req = NULL; - DECLARE_CRYPTO_WAIT(wait); const struct fscrypt_inode_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; + struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; struct scatterlist sg; - int res; + int err; /* * Copy the filename to the output buffer for encrypting in-place and @@ -109,28 +110,17 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, memcpy(out, iname->name, iname->len); memset(out + iname->len, 0, olen - iname->len); - /* Initialize the IV */ fscrypt_generate_iv(&iv, 0, ci); - /* Set up the encryption request */ - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) - return -ENOMEM; - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &wait); + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); sg_init_one(&sg, out, olen); skcipher_request_set_crypt(req, &sg, &sg, olen, &iv); - - /* Do the encryption */ - res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); - skcipher_request_free(req); - if (res < 0) { - fscrypt_err(inode, "Filename encryption failed: %d", res); - return res; - } - - return 0; + err = crypto_skcipher_encrypt(req); + if (err) + fscrypt_err(inode, "Filename encryption failed: %d", err); + return err; } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); @@ -148,34 +138,25 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - struct skcipher_request *req = NULL; - DECLARE_CRYPTO_WAIT(wait); - struct scatterlist src_sg, dst_sg; const struct fscrypt_inode_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; + struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; - int res; - - /* Allocate request */ - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) - return -ENOMEM; - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &wait); + struct scatterlist src_sg, dst_sg; + int err; - /* Initialize IV */ fscrypt_generate_iv(&iv, 0, ci); - /* Create decryption request */ + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv); - res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); - skcipher_request_free(req); - if (res < 0) { - fscrypt_err(inode, "Filename decryption failed: %d", res); - return res; + err = crypto_skcipher_decrypt(req); + if (err) { + fscrypt_err(inode, "Filename decryption failed: %d", err); + return err; } oname->len = strnlen(oname->name, iname->len); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index c1d92074b65c..d8b485b9881c 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -45,6 +45,24 @@ */ #undef FSCRYPT_MAX_KEY_SIZE +/* + * This mask is passed as the third argument to the crypto_alloc_*() functions + * to prevent fscrypt from using the Crypto API drivers for non-inline crypto + * engines. Those drivers have been problematic for fscrypt. fscrypt users + * have reported hangs and even incorrect en/decryption with these drivers. + * Since going to the driver, off CPU, and back again is really slow, such + * drivers can be over 50 times slower than the CPU-based code for fscrypt's + * workload. Even on platforms that lack AES instructions on the CPU, using the + * offloads has been shown to be slower, even staying with AES. (Of course, + * Adiantum is faster still, and is the recommended option on such platforms...) + * + * Note that fscrypt also supports inline crypto engines. Those don't use the + * Crypto API and work much better than the old-style (non-inline) engines. + */ +#define FSCRYPT_CRYPTOAPI_MASK \ + (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | \ + CRYPTO_ALG_KERN_DRIVER_ONLY) + #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 @@ -221,7 +239,7 @@ struct fscrypt_symlink_data { * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { - struct crypto_skcipher *tfm; + struct crypto_sync_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif @@ -319,8 +337,7 @@ int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags); + unsigned int len, unsigned int offs); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 0f3028adc9c7..b1ef506cd341 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -8,8 +8,8 @@ */ #include <crypto/hash.h> -#include <crypto/sha2.h> #include <crypto/hkdf.h> +#include <crypto/sha2.h> #include "fscrypt_private.h" @@ -58,7 +58,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, u8 prk[HKDF_HASHLEN]; int err; - hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); + hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK); if (IS_ERR(hmac_tfm)) { fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", PTR_ERR(hmac_tfm)); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index d8d5049b8fe1..e0b32ac841f7 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -5,6 +5,8 @@ * Encryption hooks for higher-level filesystem operations. */ +#include <linux/export.h> + #include "fscrypt_private.h" /** diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 1d008c440cb6..caaff809765b 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -15,6 +15,7 @@ #include <linux/blk-crypto.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> +#include <linux/export.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/uio.h> diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index ace369f13068..7557f6a88b8f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -18,12 +18,13 @@ * information about these ioctls. */ -#include <linux/unaligned.h> #include <crypto/skcipher.h> +#include <linux/export.h> #include <linux/key-type.h> -#include <linux/random.h> #include <linux/once.h> +#include <linux/random.h> #include <linux/seq_file.h> +#include <linux/unaligned.h> #include "fscrypt_private.h" diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 0d71843af946..4f3b9ecbfe4e 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -9,6 +9,7 @@ */ #include <crypto/skcipher.h> +#include <linux/export.h> #include <linux/random.h> #include "fscrypt_private.h" @@ -96,14 +97,15 @@ select_encryption_mode(const union fscrypt_policy *policy, } /* Create a symmetric cipher object for the given encryption mode and key */ -static struct crypto_skcipher * +static struct crypto_sync_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode) { - struct crypto_skcipher *tfm; + struct crypto_sync_skcipher *tfm; int err; - tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + tfm = crypto_alloc_sync_skcipher(mode->cipher_str, 0, + FSCRYPT_CRYPTOAPI_MASK); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fscrypt_warn(inode, @@ -123,21 +125,22 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, * first time a mode is used. */ pr_info("fscrypt: %s using implementation \"%s\"\n", - mode->friendly_name, crypto_skcipher_driver_name(tfm)); + mode->friendly_name, + crypto_skcipher_driver_name(&tfm->base)); } - if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { + if (WARN_ON_ONCE(crypto_sync_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); + crypto_sync_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + err = crypto_sync_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; return tfm; err_free_tfm: - crypto_free_skcipher(tfm); + crypto_free_sync_skcipher(tfm); return ERR_PTR(err); } @@ -150,7 +153,7 @@ err_free_tfm: int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { - struct crypto_skcipher *tfm; + struct crypto_sync_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, @@ -174,7 +177,7 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { - crypto_free_skcipher(prep_key->tfm); + crypto_free_sync_skcipher(prep_key->tfm); fscrypt_destroy_inline_crypt_key(sb, prep_key); memzero_explicit(prep_key, sizeof(*prep_key)); } diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index b70521c55132..c4d05168522b 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -48,39 +48,30 @@ static int derive_key_aes(const u8 *master_key, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], u8 *derived_key, unsigned int derived_keysize) { - int res = 0; - struct skcipher_request *req = NULL; - DECLARE_CRYPTO_WAIT(wait); - struct scatterlist src_sg, dst_sg; - struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); - req = skcipher_request_alloc(tfm, GFP_KERNEL); - if (!req) { - res = -ENOMEM; - goto out; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE); - if (res < 0) - goto out; + struct crypto_sync_skcipher *tfm; + int err; - sg_init_one(&src_sg, master_key, derived_keysize); - sg_init_one(&dst_sg, derived_key, derived_keysize); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, - NULL); - res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); -out: - skcipher_request_free(req); - crypto_free_skcipher(tfm); - return res; + tfm = crypto_alloc_sync_skcipher("ecb(aes)", 0, FSCRYPT_CRYPTOAPI_MASK); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + err = crypto_sync_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE); + if (err == 0) { + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + struct scatterlist src_sg, dst_sg; + + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + derived_keysize, NULL); + err = crypto_skcipher_encrypt(req); + } + crypto_free_sync_skcipher(tfm); + return err; } /* diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 701259991277..6ad30ae07c06 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,11 +10,13 @@ * Modified by Eric Biggers, 2019 for v2 policy support. */ +#include <linux/export.h> #include <linux/fs_context.h> +#include <linux/mount.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/string.h> -#include <linux/mount.h> + #include "fscrypt_private.h" /** diff --git a/fs/d_path.c b/fs/d_path.c index 5f4da5c8d5db..bb365511066b 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); *root = fs->root; - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } /** @@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); *root = fs->root; *pwd = fs->pwd; - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } /* @@ -20,7 +20,6 @@ #include <linux/sched/signal.h> #include <linux/uio.h> #include <linux/vmstat.h> -#include <linux/pfn_t.h> #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> @@ -76,9 +75,9 @@ static struct folio *dax_to_folio(void *entry) return page_folio(pfn_to_page(dax_to_pfn(entry))); } -static void *dax_make_entry(pfn_t pfn, unsigned long flags) +static void *dax_make_entry(unsigned long pfn, unsigned long flags) { - return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); + return xa_mk_value(flags | (pfn << DAX_SHIFT)); } static bool dax_is_locked(void *entry) @@ -257,7 +256,7 @@ static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - xas_pause(xas); + xas_reset(xas); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); @@ -449,9 +448,6 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) - return; - index = linear_page_index(vma, address & ~(size - 1)); if (shared && (folio->mapping || dax_folio_is_shared(folio))) { if (folio->mapping) @@ -474,9 +470,6 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, { struct folio *folio = dax_to_folio(entry); - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) - return; - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return; @@ -719,7 +712,7 @@ retry: if (order > 0) flags |= DAX_PMD; - entry = dax_make_entry(pfn_to_pfn_t(0), flags); + entry = dax_make_entry(0, flags); dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; @@ -768,12 +761,6 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t end_idx; XA_STATE(xas, &mapping->i_pages, start_idx); - /* - * In the 'limited' case get_user_pages() for dax is disabled. - */ - if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) - return NULL; - if (!dax_mapping(mapping)) return NULL; @@ -1053,7 +1040,7 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, * appropriate. */ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, - const struct iomap_iter *iter, void *entry, pfn_t pfn, + const struct iomap_iter *iter, void *entry, unsigned long pfn, unsigned long flags) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1251,7 +1238,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, - size_t size, void **kaddr, pfn_t *pfnp) + size_t size, void **kaddr, unsigned long *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id, rc = 0; @@ -1269,7 +1256,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; - if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + if (*pfnp & (PHYS_PFN(size)-1)) goto out; rc = 0; @@ -1373,12 +1360,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; - pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); + unsigned long pfn = my_zero_pfn(vaddr); vm_fault_t ret; *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); - ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false); + ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false); trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -1395,14 +1382,14 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, struct folio *zero_folio; spinlock_t *ptl; pmd_t pmd_entry; - pfn_t pfn; + unsigned long pfn; zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); if (unlikely(!zero_folio)) goto fallback; - pfn = page_to_pfn_t(&zero_folio->page); + pfn = page_to_pfn(&zero_folio->page); *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE); @@ -1422,8 +1409,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } - pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); + pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); @@ -1792,7 +1778,8 @@ static vm_fault_t dax_fault_return(int error) * insertion for now and return the pfn so that caller can insert it after the * fsync is done. */ -static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) +static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp, + unsigned long pfn) { if (WARN_ON_ONCE(!pfnp)) return VM_FAULT_SIGBUS; @@ -1840,7 +1827,7 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, * @pmd: distinguish whether it is a pmd fault */ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, - const struct iomap_iter *iter, pfn_t *pfnp, + const struct iomap_iter *iter, unsigned long *pfnp, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; @@ -1851,7 +1838,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, unsigned long entry_flags = pmd ? DAX_PMD : 0; struct folio *folio; int ret, err = 0; - pfn_t pfn; + unsigned long pfn; void *kaddr; if (!pmd && vmf->cow_page) @@ -1888,16 +1875,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, folio_ref_inc(folio); if (pmd) - ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), - write); + ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write); else - ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write); + ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write); folio_put(folio); return ret; } -static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1938,7 +1924,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * the PTE we need to set up. If so just return and the fault will be * retried. */ - if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto unlock_entry; } @@ -2009,7 +1995,7 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, return false; } -static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -2061,8 +2047,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * the PMD we need to set up. If so just return and the fault will be * retried. */ - if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && - !pmd_devmap(*vmf->pmd)) { + if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) { ret = 0; goto unlock_entry; } @@ -2091,7 +2076,7 @@ out: return ret; } #else -static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -2112,7 +2097,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * successfully. */ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, - pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) + unsigned long *pfnp, int *iomap_errp, + const struct iomap_ops *ops) { if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); @@ -2132,8 +2118,8 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty. */ -static vm_fault_t -dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) +static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, + unsigned long pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); @@ -2155,7 +2141,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); - folio = pfn_folio(pfn_t_to_pfn(pfn)); + folio = pfn_folio(pfn); folio_ref_inc(folio); if (order == 0) ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); @@ -2182,7 +2168,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) * table entry. */ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, - pfn_t pfn) + unsigned long pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; diff --git a/fs/dcache.c b/fs/dcache.c index 03d58b2d4fa3..60046ae23d51 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1436,7 +1436,7 @@ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; - write_seqlock(&rename_lock); + read_seqlock_excl(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); @@ -1456,7 +1456,7 @@ int d_set_mounted(struct dentry *dentry) } spin_unlock(&dentry->d_lock); out: - write_sequnlock(&rename_lock); + read_sequnlock_excl(&rename_lock); return ret; } @@ -1731,14 +1731,14 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; - dentry->d_op = NULL; + dentry->d_op = sb->__s_d_op; + dentry->d_flags = sb->s_d_flags; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); - d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); @@ -1821,8 +1821,9 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; - if (!sb->s_d_op) - d_set_d_op(dentry, &anon_ops); + /* d_op_flags(&anon_ops) is 0 */ + if (!dentry->d_op) + dentry->d_op = &anon_ops; } return dentry; } @@ -1837,35 +1838,50 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) } EXPORT_SYMBOL(d_alloc_name); -void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +#define DCACHE_OP_FLAGS \ + (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \ + DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \ + DCACHE_OP_REAL) + +static unsigned int d_op_flags(const struct dentry_operations *op) { + unsigned int flags = 0; + if (op) { + if (op->d_hash) + flags |= DCACHE_OP_HASH; + if (op->d_compare) + flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + flags |= DCACHE_OP_REVALIDATE; + if (op->d_weak_revalidate) + flags |= DCACHE_OP_WEAK_REVALIDATE; + if (op->d_delete) + flags |= DCACHE_OP_DELETE; + if (op->d_prune) + flags |= DCACHE_OP_PRUNE; + if (op->d_real) + flags |= DCACHE_OP_REAL; + } + return flags; +} + +static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + unsigned int flags = d_op_flags(op); WARN_ON_ONCE(dentry->d_op); - WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | - DCACHE_OP_COMPARE | - DCACHE_OP_REVALIDATE | - DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE | - DCACHE_OP_REAL)); + WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS); dentry->d_op = op; - if (!op) - return; - if (op->d_hash) - dentry->d_flags |= DCACHE_OP_HASH; - if (op->d_compare) - dentry->d_flags |= DCACHE_OP_COMPARE; - if (op->d_revalidate) - dentry->d_flags |= DCACHE_OP_REVALIDATE; - if (op->d_weak_revalidate) - dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; - if (op->d_delete) - dentry->d_flags |= DCACHE_OP_DELETE; - if (op->d_prune) - dentry->d_flags |= DCACHE_OP_PRUNE; - if (op->d_real) - dentry->d_flags |= DCACHE_OP_REAL; - -} -EXPORT_SYMBOL(d_set_d_op); + if (flags) + dentry->d_flags |= flags; +} + +void set_default_d_op(struct super_block *s, const struct dentry_operations *ops) +{ + unsigned int flags = d_op_flags(ops); + s->__s_d_op = ops; + s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags; +} +EXPORT_SYMBOL(set_default_d_op); static unsigned d_flags_for_inode(struct inode *inode) { @@ -2530,13 +2546,19 @@ struct dentry *d_alloc_parallel(struct dentry *parent, unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; - struct dentry *new = d_alloc(parent, name); + struct dentry *new = __d_alloc(parent->d_sb, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); + new->d_flags |= DCACHE_PAR_LOOKUP; + spin_lock(&parent->d_lock); + new->d_parent = dget_dlock(parent); + hlist_add_head(&new->d_sib, &parent->d_children); + spin_unlock(&parent->d_lock); + retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); @@ -2620,8 +2642,6 @@ retry: return dentry; } rcu_read_unlock(); - /* we can't take ->d_lock here; it's OK, though. */ - new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); @@ -2667,7 +2687,8 @@ EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ -static inline void __d_add(struct dentry *dentry, struct inode *inode) +static inline void __d_add(struct dentry *dentry, struct inode *inode, + const struct dentry_operations *ops) { wait_queue_head_t *d_wait; struct inode *dir = NULL; @@ -2678,6 +2699,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } + if (unlikely(ops)) + d_set_d_op(dentry, ops); if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); @@ -2709,7 +2732,7 @@ void d_add(struct dentry *entry, struct inode *inode) security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } - __d_add(entry, inode); + __d_add(entry, inode, NULL); } EXPORT_SYMBOL(d_add); @@ -2774,10 +2797,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target) * @target: new dentry * @exchange: exchange the two dentries * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller must hold - * rename_lock, the i_mutex of the source and target directories, - * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). + * Update the dcache to reflect the move of a file name. Negative dcache + * entries should not be moved in this way. Caller must hold rename_lock, the + * i_rwsem of the source and target directories (exclusively), and the sb-> + * s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) @@ -2923,7 +2946,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, and rename_lock + * dentry->d_parent->d_inode->i_rwsem, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... @@ -2961,30 +2984,8 @@ out_err: return ret; } -/** - * d_splice_alias - splice a disconnected dentry into the tree if one exists - * @inode: the inode which may have a disconnected dentry - * @dentry: a negative dentry which we want to point to the inode. - * - * If inode is a directory and has an IS_ROOT alias, then d_move that in - * place of the given dentry and return it, else simply d_add the inode - * to the dentry and return NULL. - * - * If a non-IS_ROOT directory is found, the filesystem is corrupt, and - * we should error out: directories can't have multiple aliases. - * - * This is needed in the lookup routine of any filesystem that is exportable - * (via knfsd) so that we can build dcache paths to directories effectively. - * - * If a dentry was found and moved, then it is returned. Otherwise NULL - * is returned. This matches the expected return value of ->lookup. - * - * Cluster filesystems may call this function with a negative, hashed dentry. - * In that case, we know that the inode will be a regular file, and also this - * will only occur during atomic_open. So we need to check for the dentry - * being already hashed only in the final case. - */ -struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry, + const struct dentry_operations *ops) { if (IS_ERR(inode)) return ERR_CAST(inode); @@ -3030,9 +3031,37 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } } out: - __d_add(dentry, inode); + __d_add(dentry, inode, ops); return NULL; } + +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. + * + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. + * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + return d_splice_alias_ops(inode, dentry, NULL); +} EXPORT_SYMBOL(d_splice_alias); /* diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 69e9ddcb113d..3ec3324c2060 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -47,29 +47,12 @@ const struct file_operations debugfs_noop_file_operations = { #define F_DENTRY(filp) ((filp)->f_path.dentry) -const void *debugfs_get_aux(const struct file *file) +void *debugfs_get_aux(const struct file *file) { return DEBUGFS_I(file_inode(file))->aux; } EXPORT_SYMBOL_GPL(debugfs_get_aux); -const struct file_operations *debugfs_real_fops(const struct file *filp) -{ - struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; - - if (!fsd) { - /* - * Urgh, we've been called w/o a protecting - * debugfs_file_get(). - */ - WARN_ON(1); - return NULL; - } - - return fsd->real_fops; -} -EXPORT_SYMBOL_GPL(debugfs_real_fops); - enum dbgfs_get_mode { DBGFS_GET_ALREADY, DBGFS_GET_REGULAR, @@ -302,15 +285,13 @@ static int debugfs_locked_down(struct inode *inode, static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = NULL; + const struct file_operations *real_fops = DEBUGFS_I(inode)->real_fops; int r; r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; - real_fops = debugfs_real_fops(filp); - r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -352,7 +333,6 @@ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ struct debugfs_fsdata *fsd = dentry->d_fsdata; \ - const struct file_operations *real_fops; \ ret_type r; \ \ if (!(fsd->methods & bit)) \ @@ -360,14 +340,13 @@ static ret_type full_proxy_ ## name(proto) \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ - real_fops = debugfs_real_fops(filp); \ - r = real_fops->name(args); \ + r = fsd->real_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } -#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret) \ -static ret_type full_proxy_ ## name(proto) \ +#define SHORT_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \ +static ret_type short_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ struct debugfs_fsdata *fsd = dentry->d_fsdata; \ @@ -378,27 +357,38 @@ static ret_type full_proxy_ ## name(proto) \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ - if (fsd->real_fops) \ - r = fsd->real_fops->name(args); \ - else \ - r = fsd->short_fops->name(args); \ + r = fsd->short_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } -FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp, - PROTO(struct file *filp, loff_t offset, int whence), - ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE); +SHORT_PROXY_FUNC(llseek, loff_t, filp, + PROTO(struct file *filp, loff_t offset, int whence), + ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE); -FULL_PROXY_FUNC_BOTH(read, ssize_t, filp, - PROTO(struct file *filp, char __user *buf, size_t size, - loff_t *ppos), - ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL); +FULL_PROXY_FUNC(llseek, loff_t, filp, + PROTO(struct file *filp, loff_t offset, int whence), + ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE); -FULL_PROXY_FUNC_BOTH(write, ssize_t, filp, - PROTO(struct file *filp, const char __user *buf, - size_t size, loff_t *ppos), - ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL); +SHORT_PROXY_FUNC(read, ssize_t, filp, + PROTO(struct file *filp, char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL); + +FULL_PROXY_FUNC(read, ssize_t, filp, + PROTO(struct file *filp, char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL); + +SHORT_PROXY_FUNC(write, ssize_t, filp, + PROTO(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos), + ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL); + +FULL_PROXY_FUNC(write, ssize_t, filp, + PROTO(struct file *filp, const char __user *buf, + size_t size, loff_t *ppos), + ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), @@ -410,22 +400,21 @@ static __poll_t full_proxy_poll(struct file *filp, struct dentry *dentry = F_DENTRY(filp); struct debugfs_fsdata *fsd = dentry->d_fsdata; __poll_t r = 0; - const struct file_operations *real_fops; if (!(fsd->methods & HAS_POLL)) return DEFAULT_POLLMASK; if (debugfs_file_get(dentry)) return EPOLLHUP; - real_fops = debugfs_real_fops(filp); - r = real_fops->poll(filp, wait); + r = fsd->real_fops->poll(filp, wait); debugfs_file_put(dentry); return r; } -static int full_proxy_release(struct inode *inode, struct file *filp) +static int full_proxy_release(struct inode *inode, struct file *file) { - const struct file_operations *real_fops = debugfs_real_fops(filp); + struct debugfs_fsdata *fsd = F_DENTRY(file)->d_fsdata; + const struct file_operations *real_fops = fsd->real_fops; int r = 0; /* @@ -435,7 +424,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp) * ->i_private is still being meaningful here. */ if (real_fops->release) - r = real_fops->release(inode, filp); + r = real_fops->release(inode, file); fops_put(real_fops); return r; @@ -517,9 +506,9 @@ static int full_proxy_open_short(struct inode *inode, struct file *filp) const struct file_operations debugfs_full_short_proxy_file_operations = { .open = full_proxy_open_short, - .llseek = full_proxy_llseek, - .read = full_proxy_read, - .write = full_proxy_write, + .llseek = short_proxy_llseek, + .read = short_proxy_read, + .write = short_proxy_write, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 30c4944e1862..a0357b0cf362 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -258,7 +258,6 @@ static struct vfsmount *debugfs_automount(struct path *path) } static const struct dentry_operations debugfs_dops = { - .d_delete = always_delete_dentry, .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; @@ -273,7 +272,8 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc) return err; sb->s_op = &debugfs_super_operations; - sb->s_d_op = &debugfs_dops; + set_default_d_op(sb, &debugfs_dops); + sb->s_d_flags |= DCACHE_DONTCACHE; debugfs_apply_options(sb); @@ -384,27 +384,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - inode_lock(d_inode(parent)); - if (unlikely(IS_DEADDIR(d_inode(parent)))) - dentry = ERR_PTR(-ENOENT); - else - dentry = lookup_noperm(&QSTR(name), parent); - if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { - if (d_is_dir(dentry)) - pr_err("Directory '%s' with parent '%s' already present!\n", - name, parent->d_name.name); - else - pr_err("File '%s' in directory '%s' already present!\n", - name, parent->d_name.name); - dput(dentry); - dentry = ERR_PTR(-EEXIST); - } - + dentry = simple_start_creating(parent, name); if (IS_ERR(dentry)) { - inode_unlock(d_inode(parent)); + if (dentry == ERR_PTR(-EEXIST)) + pr_err("'%s' already exists in '%pd'\n", name, parent); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } - return dentry; } @@ -459,7 +444,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, proxy_fops = &debugfs_noop_file_operations; inode->i_fop = proxy_fops; DEBUGFS_I(inode)->raw = real_fops; - DEBUGFS_I(inode)->aux = aux; + DEBUGFS_I(inode)->aux = (void *)aux; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 93483fe84425..427987f81571 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,7 +19,7 @@ struct debugfs_inode_info { const struct debugfs_short_fops *short_fops; debugfs_automount_t automount; }; - const void *aux; + void *aux; }; static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 9c20d78e41f6..fdf22264a8e9 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -381,7 +381,7 @@ static int devpts_fill_super(struct super_block *s, struct fs_context *fc) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; - s->s_d_op = &simple_dentry_operations; + s->s_d_flags = DCACHE_DONTCACHE; s->s_time_gran = 1; fsi->sb = s; diff --git a/fs/direct-io.c b/fs/direct-io.c index bbd05f1a2145..2267f5ae7f77 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -996,7 +996,7 @@ do_holes: dio_unpin_page(dio, page); goto out; } - zero_user(page, from, 1 << blkbits); + memzero_page(page, from, 1 << blkbits); sdio->block_in_file++; from += 1 << blkbits; dio->result += 1 << blkbits; @@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio) * The locking rules are governed by the flags parameter: * - if the flags value contains DIO_LOCKING we use a fancy locking * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is + * For writes this function is called under i_rwsem and returns with + * i_rwsem held, for reads, i_rwsem is not held on entry, but it is * taken and dropped again before returning. * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize @@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio) * counter before starting direct I/O, and decrement it once we are done. * Truncate can wait for it to reach zero to provide exclusion. It is * expected that filesystem provide exclusion between new direct I/O - * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem, * but other filesystems need to take care of this on their own. * * NOTE: if you pass "sdio" to anything by pointer make sure that function @@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose + * we can let i_rwsem go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index e01d5f29f4d2..6dd3a524cd35 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -509,7 +509,7 @@ static void add_scan(struct dlm_ls *ls, struct dlm_rsb *r) void dlm_rsb_scan(struct timer_list *timer) { - struct dlm_ls *ls = from_timer(ls, timer, ls_scan_timer); + struct dlm_ls *ls = timer_container_of(ls, timer, ls_scan_timer); int our_nodeid = dlm_our_nodeid(); struct dlm_rsb *r; int rv; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index ce0a3c5ed0ca..5f8f96da09fe 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -193,7 +193,7 @@ static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs * allows recursive mounting, this will need to be extended. */ - if (!lower_file->f_op->mmap) + if (!can_mmap_file(lower_file)) return -ENODEV; return generic_file_mmap(file, vma); } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 493d7f194956..72fbe1316ab8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -635,10 +635,10 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } rd.old_mnt_idmap = &nop_mnt_idmap; - rd.old_dir = d_inode(lower_old_dir_dentry); + rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; rd.new_mnt_idmap = &nop_mnt_idmap; - rd.new_dir = d_inode(lower_new_dir_dentry); + rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); if (rc) @@ -1124,13 +1124,13 @@ out: return rc; } -static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int ecryptfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa); } static int ecryptfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 8dd1d7189c3b..eab1beb846d3 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -20,6 +20,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs_stack.h> +#include <linux/sysfs.h> #include <linux/slab.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" @@ -471,7 +472,7 @@ static int ecryptfs_get_tree(struct fs_context *fc) sbi = NULL; s->s_op = &ecryptfs_sops; s->s_xattr = ecryptfs_xattr_handlers; - s->s_d_op = &ecryptfs_dops; + set_default_d_op(s, &ecryptfs_dops); err = "Reading sb failed"; rc = kern_path(fc->source, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); @@ -764,7 +765,7 @@ static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { - return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); + return sysfs_emit(buff, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 60f0ac8744b5..2c2b12fedeae 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -228,7 +228,7 @@ out: /** * ecryptfs_write_begin - * @file: The eCryptfs file + * @iocb: I/O control block for the eCryptfs file * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write @@ -239,7 +239,7 @@ out: * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_write_begin(struct file *file, +static int ecryptfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file, * Note, this will increase i_size. */ if (index != 0) { if (prev_page_end_size > i_size_read(mapping->host)) { - rc = ecryptfs_truncate(file->f_path.dentry, + rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry, prev_page_end_size); if (rc) { printk(KERN_ERR "%s: Error on attempt to " @@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) /** * ecryptfs_write_end - * @file: The eCryptfs file object + * @iocb: I/O control block for the eCryptfs file * @mapping: The eCryptfs object * @pos: The file position * @len: The length of the data (unused) @@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @folio: The eCryptfs folio * @fsdata: The fsdata (unused) */ -static int ecryptfs_write_end(struct file *file, +static int ecryptfs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 98a7299a9ee9..2891614abf8d 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -138,7 +138,7 @@ const struct inode_operations efivarfs_dir_inode_operations = { }; static int -efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +efivarfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { unsigned int i_flags; unsigned int flags = 0; @@ -154,7 +154,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) static int efivarfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { unsigned int i_flags = 0; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index c900d98bf494..c4a139911356 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -183,7 +183,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) static const struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, - .d_delete = always_delete_dentry, }; static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) @@ -350,7 +349,8 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = EFIVARFS_MAGIC; sb->s_op = &efivarfs_ops; - sb->s_d_op = &efivarfs_d_ops; + set_default_d_op(sb, &efivarfs_d_ops); + sb->s_d_flags |= DCACHE_DONTCACHE; sb->s_time_gran = 1; if (!efivar_supports_writes()) @@ -390,10 +390,16 @@ static int efivarfs_reconfigure(struct fs_context *fc) return 0; } +static void efivarfs_free(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + static const struct fs_context_operations efivarfs_context_ops = { .get_tree = efivarfs_get_tree, .parse_param = efivarfs_parse_param, .reconfigure = efivarfs_reconfigure, + .free = efivarfs_free, }; static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 6beeb7063871..7b26efc271ee 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -147,6 +147,8 @@ config EROFS_FS_ZIP_ZSTD config EROFS_FS_ZIP_ACCEL bool "EROFS hardware decompression support" depends on EROFS_FS_ZIP + select CRYPTO + select CRYPTO_DEFLATE help Saying Y here includes hardware accelerator support for reading EROFS file systems containing compressed data. It gives better diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 6a329c329f43..3b1ba571c728 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -49,11 +49,18 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap) return buf->base + (offset & ~PAGE_MASK); } -void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) +int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb, + bool in_metabox) { struct erofs_sb_info *sbi = EROFS_SB(sb); buf->file = NULL; + if (in_metabox) { + if (unlikely(!sbi->metabox_inode)) + return -EFSCORRUPTED; + buf->mapping = sbi->metabox_inode->i_mapping; + return 0; + } buf->off = sbi->dif0.fsoff; if (erofs_is_fileio_mode(sbi)) { buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ @@ -62,13 +69,18 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) buf->mapping = sbi->dif0.fscache->inode->i_mapping; else buf->mapping = sb->s_bdev->bd_mapping; + return 0; } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, bool need_kmap) + erofs_off_t offset, bool in_metabox) { - erofs_init_metabuf(buf, sb); - return erofs_bread(buf, offset, need_kmap); + int err; + + err = erofs_init_metabuf(buf, sb, in_metabox); + if (err) + return ERR_PTR(err); + return erofs_bread(buf, offset, true); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) @@ -118,7 +130,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - idx = erofs_read_metabuf(&buf, sb, pos, true); + idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode)); if (IS_ERR(idx)) { err = PTR_ERR(idx); goto out; @@ -214,9 +226,11 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) /* * bit 30: I/O error occurred on this folio + * bit 29: CPU has dirty data in D-cache (needs aliasing handling); * bit 0 - 29: remaining parts to complete this folio */ -#define EROFS_ONLINEFOLIO_EIO (1 << 30) +#define EROFS_ONLINEFOLIO_EIO 30 +#define EROFS_ONLINEFOLIO_DIRTY 29 void erofs_onlinefolio_init(struct folio *folio) { @@ -233,19 +247,23 @@ void erofs_onlinefolio_split(struct folio *folio) atomic_inc((atomic_t *)&folio->private); } -void erofs_onlinefolio_end(struct folio *folio, int err) +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty) { int orig, v; do { orig = atomic_read((atomic_t *)&folio->private); - v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0); + DBG_BUGON(orig <= 0); + v = dirty << EROFS_ONLINEFOLIO_DIRTY; + v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO); } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig); - if (v & ~EROFS_ONLINEFOLIO_EIO) + if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1)) return; folio->private = 0; - folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); + if (v & BIT(EROFS_ONLINEFOLIO_DIRTY)) + flush_dcache_folio(folio); + folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO))); } static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -258,51 +276,51 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_la = offset; map.m_llen = length; - ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; - mdev = (struct erofs_map_dev) { - .m_deviceid = map.m_deviceid, - .m_pa = map.m_pa, - }; - ret = erofs_map_dev(sb, &mdev); - if (ret) - return ret; - iomap->offset = map.m_la; - if (flags & IOMAP_DAX) - iomap->dax_dev = mdev.m_dif->dax_dev; - else - iomap->bdev = mdev.m_bdev; iomap->length = map.m_llen; iomap->flags = 0; iomap->private = NULL; - + iomap->addr = IOMAP_NULL_ADDR; if (!(map.m_flags & EROFS_MAP_MAPPED)) { iomap->type = IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; - if (!iomap->length) - iomap->length = length; return 0; } + if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) { + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(sb, &mdev); + if (ret) + return ret; + + if (flags & IOMAP_DAX) + iomap->dax_dev = mdev.m_dif->dax_dev; + else + iomap->bdev = mdev.m_bdev; + iomap->addr = mdev.m_dif->fsoff + mdev.m_pa; + if (flags & IOMAP_DAX) + iomap->addr += mdev.m_dif->dax_part_off; + } + if (map.m_flags & EROFS_MAP_META) { void *ptr; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true); + ptr = erofs_read_metabuf(&buf, sb, map.m_pa, + erofs_inode_in_metabox(inode)); if (IS_ERR(ptr)) return PTR_ERR(ptr); iomap->inline_data = ptr; iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = mdev.m_dif->fsoff + mdev.m_pa; - if (flags & IOMAP_DAX) - iomap->addr += mdev.m_dif->dax_part_off; } return 0; } @@ -351,11 +369,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ static int erofs_read_folio(struct file *file, struct folio *folio) { + trace_erofs_read_folio(folio, true); + return iomap_read_folio(folio, &erofs_iomap_ops); } static void erofs_readahead(struct readahead_control *rac) { + trace_erofs_readahead(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + return iomap_readahead(rac, &erofs_iomap_ops); } @@ -409,20 +432,20 @@ static const struct vm_operations_struct erofs_dax_vm_ops = { .huge_fault = erofs_dax_huge_fault, }; -static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int erofs_file_mmap_prepare(struct vm_area_desc *desc) { - if (!IS_DAX(file_inode(file))) - return generic_file_readonly_mmap(file, vma); + if (!IS_DAX(file_inode(desc->file))) + return generic_file_readonly_mmap_prepare(desc); - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) return -EINVAL; - vma->vm_ops = &erofs_dax_vm_ops; - vm_flags_set(vma, VM_HUGEPAGE); + desc->vm_ops = &erofs_dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; return 0; } #else -#define erofs_file_mmap generic_file_readonly_mmap +#define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare #endif static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) @@ -452,7 +475,7 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) const struct file_operations erofs_file_fops = { .llseek = erofs_file_llseek, .read_iter = erofs_file_read_iter, - .mmap = erofs_file_mmap, + .mmap_prepare = erofs_file_mmap_prepare, .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index bf62e2836b60..354762c9723f 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -301,13 +301,11 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, cur = min(cur, rq->outputsize); if (cur && rq->out[0]) { kin = kmap_local_page(rq->in[nrpages_in - 1]); - if (rq->out[0] == rq->in[nrpages_in - 1]) { + if (rq->out[0] == rq->in[nrpages_in - 1]) memmove(kin + rq->pageofs_out, kin + pi, cur); - flush_dcache_page(rq->out[0]); - } else { + else memcpy_to_page(rq->out[0], rq->pageofs_out, kin + pi, cur); - } kunmap_local(kin); } rq->outputsize -= cur; @@ -325,14 +323,12 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; DBG_BUGON(no >= nrpages_out); cnt = min(insz - pi, PAGE_SIZE - po); - if (rq->out[no] == rq->in[ni]) { + if (rq->out[no] == rq->in[ni]) memmove(kin + po, kin + rq->pageofs_in + pi, cnt); - flush_dcache_page(rq->out[no]); - } else if (rq->out[no]) { + else if (rq->out[no]) memcpy_to_page(rq->out[no], po, kin + rq->pageofs_in + pi, cnt); - } pi += cnt; } while (pi < insz); kunmap_local(kin); @@ -471,7 +467,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) return -EOPNOTSUPP; } - erofs_init_metabuf(&buf, sb); + (void)erofs_init_metabuf(&buf, sb, false); offset = EROFS_SUPER_OFFSET + sbi->sb_size; alg = 0; for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2fae209d0274..debf469ad6bd 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -34,7 +34,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, } if (!dir_emit(ctx, de_name, de_namelen, - le64_to_cpu(de->nid), d_type)) + erofs_nid_to_ino64(EROFS_SB(dir->i_sb), + le64_to_cpu(de->nid)), d_type)) return 1; ++de; ctx->pos += sizeof(struct erofs_dirent); @@ -47,8 +48,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct inode *dir = file_inode(f); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; + struct file_ra_state *ra = &f->f_ra; unsigned long bsz = sb->s_blocksize; unsigned int ofs = erofs_blkoff(sb, ctx->pos); + pgoff_t ra_pages = DIV_ROUND_UP_POW2( + EROFS_I_SB(dir)->dir_ra_bytes, PAGE_SIZE); + pgoff_t nr_pages = DIV_ROUND_UP_POW2(dir->i_size, PAGE_SIZE); int err = 0; bool initial = true; @@ -58,6 +63,21 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_dirent *de; unsigned int nameoff, maxsize; + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + /* readahead blocks to enhance performance for large directories */ + if (ra_pages) { + pgoff_t idx = DIV_ROUND_UP_POW2(ctx->pos, PAGE_SIZE); + pgoff_t pages = min(nr_pages - idx, ra_pages); + + if (pages > 1 && !ra_has_index(ra, idx)) + page_cache_sync_readahead(dir->i_mapping, ra, + f, idx, pages); + } + de = erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", @@ -88,6 +108,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; ctx->pos = dbstart + maxsize; ofs = 0; + cond_resched(); } erofs_put_metabuf(&buf); if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) { diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 767fb4acdc93..377ee12b8b96 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -15,6 +15,7 @@ #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 #define EROFS_FEATURE_COMPAT_MTIME 0x00000002 #define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -31,8 +32,9 @@ #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 #define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 +#define EROFS_FEATURE_INCOMPAT_METABOX 0x00000100 #define EROFS_ALL_FEATURE_INCOMPAT \ - ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1) + ((EROFS_FEATURE_INCOMPAT_METABOX << 1) - 1) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -46,7 +48,7 @@ struct erofs_deviceslot { }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) -/* erofs on-disk super block (currently 128 bytes) */ +/* erofs on-disk super block (currently 144 bytes at maximum) */ struct erofs_super_block { __le32 magic; /* file system magic number */ __le32 checksum; /* crc32c to avoid unexpected on-disk overlap */ @@ -82,7 +84,9 @@ struct erofs_super_block { __u8 reserved[3]; __le32 build_time; /* seconds added to epoch for mkfs time */ __le64 rootnid_8b; /* (48BIT on) nid of root directory */ - __u8 reserved2[8]; + __le64 reserved2; + __le64 metabox_nid; /* (METABOX on) nid of the metabox inode */ + __le64 reserved3; /* [align to extslot 1] */ }; /* @@ -267,6 +271,9 @@ struct erofs_inode_chunk_index { __le32 startblk_lo; /* starting block number of this chunk */ }; +#define EROFS_DIRENT_NID_METABOX_BIT 63 +#define EROFS_DIRENT_NID_MASK (BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT) - 1) + /* dirent sorts in alphabet order, thus we can do binary search */ struct erofs_dirent { __le64 nid; /* node number */ @@ -434,7 +441,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT }; - BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); + BUILD_BUG_ON(sizeof(struct erofs_super_block) != 144); BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 7d81f504bff0..b7b3432a9882 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -38,7 +38,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) } else { bio_for_each_folio_all(fi, &rq->bio) { DBG_BUGON(folio_test_uptodate(fi.folio)); - erofs_onlinefolio_end(fi.folio, ret); + erofs_onlinefolio_end(fi.folio, ret, false); } } bio_uninit(&rq->bio); @@ -47,6 +47,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { + const struct cred *old_cred; struct iov_iter iter; int ret; @@ -60,7 +61,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); + old_cred = override_creds(rq->iocb.ki_filp->f_cred); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + revert_creds(old_cred); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret); } @@ -93,8 +96,6 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) struct erofs_map_blocks *map = &io->map; unsigned int cur = 0, end = folio_size(folio), len, attached = 0; loff_t pos = folio_pos(folio), ofs; - struct iov_iter iter; - struct bio_vec bv; int err = 0; erofs_onlinefolio_init(folio); @@ -114,18 +115,12 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) void *src; src = erofs_read_metabuf(&buf, inode->i_sb, - map->m_pa + ofs, true); + map->m_pa + ofs, erofs_inode_in_metabox(inode)); if (IS_ERR(src)) { err = PTR_ERR(src); break; } - bvec_set_folio(&bv, folio, len, cur); - iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); - if (copy_to_iter(src, len, &iter) != len) { - erofs_put_metabuf(&buf); - err = -EIO; - break; - } + memcpy_to_folio(folio, cur, src, len); erofs_put_metabuf(&buf); } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { folio_zero_segment(folio, cur, cur + len); @@ -159,7 +154,7 @@ io_retry: } cur += len; } - erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err, false); return err; } diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 9c9129bca346..362acf828279 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io) erofs_fscache_req_put(req); } -static void erofs_fscache_req_end_io(void *priv, - ssize_t transferred_or_error, bool was_async) +static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error) { struct erofs_fscache_io *io = priv; struct erofs_fscache_rq *req = io->private; @@ -180,8 +179,7 @@ struct erofs_fscache_bio { struct bio_vec bvecs[BIO_MAX_VECS]; }; -static void erofs_fscache_bio_endio(void *priv, - ssize_t transferred_or_error, bool was_async) +static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error) { struct erofs_fscache_bio *io = priv; @@ -276,7 +274,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) size_t size = map.m_llen; void *src; - src = erofs_read_metabuf(&buf, sb, map.m_pa, true); + src = erofs_read_metabuf(&buf, sb, map.m_pa, + erofs_inode_in_metabox(inode)); if (IS_ERR(src)) return PTR_ERR(src); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index a0ae0b4f7b01..9a2f59721522 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -29,6 +29,7 @@ static int erofs_read_inode(struct inode *inode) struct super_block *sb = inode->i_sb; erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode)); unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode)); + bool in_mbox = erofs_inode_in_metabox(inode); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi = EROFS_SB(sb); erofs_blk_t addrmask = BIT_ULL(48) - 1; @@ -39,10 +40,10 @@ static int erofs_read_inode(struct inode *inode) void *ptr; int err = 0; - ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true); + ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), in_mbox); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); - erofs_err(sb, "failed to get inode (nid: %llu) page, err %d", + erofs_err(sb, "failed to read inode meta block (nid: %llu): %d", vi->nid, err); goto err_out; } @@ -78,10 +79,10 @@ static int erofs_read_inode(struct inode *inode) memcpy(&copied, dic, gotten); ptr = erofs_read_metabuf(&buf, sb, - erofs_pos(sb, blkaddr + 1), true); + erofs_pos(sb, blkaddr + 1), in_mbox); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); - erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d", + erofs_err(sb, "failed to read inode payload block (nid: %llu): %d", vi->nid, err); goto err_out; } @@ -264,13 +265,13 @@ static int erofs_fill_inode(struct inode *inode) * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down * so that it will fit. */ -static ino_t erofs_squash_ino(erofs_nid_t nid) +static ino_t erofs_squash_ino(struct super_block *sb, erofs_nid_t nid) { - ino_t ino = (ino_t)nid; + u64 ino64 = erofs_nid_to_ino64(EROFS_SB(sb), nid); if (sizeof(ino_t) < sizeof(erofs_nid_t)) - ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; - return ino; + ino64 ^= ino64 >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return (ino_t)ino64; } static int erofs_iget5_eq(struct inode *inode, void *opaque) @@ -282,7 +283,7 @@ static int erofs_iget5_set(struct inode *inode, void *opaque) { const erofs_nid_t nid = *(erofs_nid_t *)opaque; - inode->i_ino = erofs_squash_ino(nid); + inode->i_ino = erofs_squash_ino(inode->i_sb, nid); EROFS_I(inode)->nid = nid; return 0; } @@ -291,7 +292,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { struct inode *inode; - inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq, + inode = iget5_locked(sb, erofs_squash_ino(sb, nid), erofs_iget5_eq, erofs_iget5_set, &nid); if (!inode) return ERR_PTR(-ENOMEM); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index a32c03a80c70..4ccc5f0ee8df 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -125,6 +125,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ struct inode *packed_inode; + struct inode *metabox_inode; struct erofs_dev_context *devs; u64 total_blocks; @@ -148,6 +149,7 @@ struct erofs_sb_info { /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; erofs_nid_t packed_nid; + erofs_nid_t metabox_nid; /* used for statfs, f_files - f_favail */ u64 inos; @@ -157,6 +159,7 @@ struct erofs_sb_info { /* sysfs support */ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */ struct completion s_kobj_unregister; + erofs_off_t dir_ra_bytes; /* fscache support */ struct fscache_volume *volume; @@ -227,8 +230,27 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT) +EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) +EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) + +static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) +{ + if (!erofs_sb_has_metabox(sbi)) + return nid; + + /* + * When metadata compression is enabled, avoid generating excessively + * large inode numbers for metadata-compressed inodes. Shift NIDs in + * the 31-62 bit range left by one and move the metabox flag to bit 31. + * + * Note: on-disk NIDs remain unchanged as they are primarily used for + * compatibility with non-LFS 32-bit applications. + */ + return ((nid << 1) & GENMASK_ULL(63, 32)) | (nid & GENMASK(30, 0)) | + ((nid >> EROFS_DIRENT_NID_METABOX_BIT) << 31); +} /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 @@ -238,6 +260,9 @@ EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) #define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1) #define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2) +/* default readahead size of directories */ +#define EROFS_DIR_RA_BYTES 16384 + struct erofs_inode { erofs_nid_t nid; @@ -279,12 +304,20 @@ struct erofs_inode { #define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) +static inline bool erofs_inode_in_metabox(struct inode *inode) +{ + return EROFS_I(inode)->nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT); +} + static inline erofs_off_t erofs_iloc(struct inode *inode) { struct erofs_sb_info *sbi = EROFS_I_SB(inode); + erofs_nid_t nid_lo = EROFS_I(inode)->nid & EROFS_DIRENT_NID_MASK; + if (erofs_inode_in_metabox(inode)) + return nid_lo << sbi->islotbits; return erofs_pos(inode->i_sb, sbi->meta_blkaddr) + - (EROFS_I(inode)->nid << sbi->islotbits); + (nid_lo << sbi->islotbits); } static inline unsigned int erofs_inode_version(unsigned int ifmt) @@ -315,10 +348,12 @@ static inline struct folio *erofs_grab_folio_nowait(struct address_space *as, /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT 0x0010 +#define __EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ #define EROFS_MAP_PARTIAL_REF 0x0020 +#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT) + struct erofs_map_blocks { struct erofs_buf buf; @@ -381,16 +416,17 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap); -void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); +int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb, + bool in_metabox); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, bool need_kmap); + erofs_off_t offset, bool in_metabox); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); void erofs_onlinefolio_init(struct folio *folio); void erofs_onlinefolio_split(struct folio *folio); -void erofs_onlinefolio_end(struct folio *folio, int err); +void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index e1e9f06e8342..e1020aa60771 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_deviceslot *dis; struct file *file; - dis = erofs_read_metabuf(buf, sb, *pos, true); + dis = erofs_read_metabuf(buf, sb, *pos, false); if (IS_ERR(dis)) return PTR_ERR(dis); @@ -258,7 +258,7 @@ static int erofs_read_superblock(struct super_block *sb) void *data; int ret; - data = erofs_read_metabuf(&buf, sb, 0, true); + data = erofs_read_metabuf(&buf, sb, 0, false); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); @@ -319,6 +319,14 @@ static int erofs_read_superblock(struct super_block *sb) sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); } sbi->packed_nid = le64_to_cpu(dsb->packed_nid); + if (erofs_sb_has_metabox(sbi)) { + if (sbi->sb_size <= offsetof(struct erofs_super_block, + metabox_nid)) + return -EFSCORRUPTED; + sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid); + if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT)) + return -EFSCORRUPTED; /* self-loop detection */ + } sbi->inos = le64_to_cpu(dsb->inos); sbi->epoch = (s64)le64_to_cpu(dsb->epoch); @@ -335,6 +343,8 @@ static int erofs_read_superblock(struct super_block *sb) if (erofs_sb_has_48bit(sbi)) erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!"); + if (erofs_sb_has_metabox(sbi)) + erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!"); out: @@ -690,6 +700,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return PTR_ERR(inode); sbi->packed_inode = inode; } + if (erofs_sb_has_metabox(sbi)) { + inode = erofs_iget(sb, sbi->metabox_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->metabox_inode = inode; + } inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) @@ -715,6 +731,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES; erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } @@ -845,6 +862,8 @@ static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) { iput(sbi->packed_inode); sbi->packed_inode = NULL; + iput(sbi->metabox_inode); + sbi->metabox_inode = NULL; #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index eed8797a193f..1e0658a1d95b 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -65,12 +65,14 @@ EROFS_ATTR_FUNC(drop_caches, 0200); #ifdef CONFIG_EROFS_FS_ZIP_ACCEL EROFS_ATTR_FUNC(accel, 0644); #endif +EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info); static struct attribute *erofs_sb_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP ATTR_LIST(sync_decompress), ATTR_LIST(drop_caches), #endif + ATTR_LIST(dir_ra_bytes), NULL, }; ATTRIBUTE_GROUPS(erofs_sb); @@ -95,6 +97,7 @@ EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); EROFS_ATTR_FEATURE(48bit); +EROFS_ATTR_FEATURE(metabox); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -108,6 +111,7 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(fragments), ATTR_LIST(dedupe), ATTR_LIST(48bit), + ATTR_LIST(metabox), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 9cf84717a92e..eaa9efd766ee 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -72,12 +72,14 @@ static int erofs_init_inode_xattrs(struct inode *inode) ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ } - ret = -ENOATTR; + ret = -ENODATA; goto out_unlock; } it.buf = __EROFS_BUF_INITIALIZER; - erofs_init_metabuf(&it.buf, sb); + ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode)); + if (ret) + goto out_unlock; it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ @@ -266,20 +268,20 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return -ENOATTR; + return -ENODATA; if (it->index != pf->prefix->base_index || it->name.len != entry.e_name_len + pf->infix_len) - return -ENOATTR; + return -ENODATA; if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) - return -ENOATTR; + return -ENODATA; it->infix_len = pf->infix_len; } else { if (it->index != entry.e_name_index || it->name.len != entry.e_name_len) - return -ENOATTR; + return -ENODATA; it->infix_len = 0; } @@ -295,7 +297,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) entry.e_name_len - processed); if (memcmp(it->name.name + it->infix_len + processed, it->kaddr, slice)) - return -ENOATTR; + return -ENODATA; it->pos += slice; } @@ -323,9 +325,12 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, sizeof(u32) * vi->xattr_shared_count; if (xattr_header_sz >= vi->xattr_isize) { DBG_BUGON(xattr_header_sz > vi->xattr_isize); - return -ENOATTR; + return -ENODATA; } + ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode)); + if (ret) + return ret; remaining = vi->xattr_isize - xattr_header_sz; it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; @@ -347,7 +352,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, ret = erofs_getxattr_foreach(it); else ret = erofs_listxattr_foreach(it); - if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) + if ((getxattr && ret != -ENODATA) || (!getxattr && ret)) break; it->pos = next_pos; @@ -361,12 +366,17 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = it->sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - unsigned int i; - int ret = -ENOATTR; + unsigned int i = 0; + int ret; - for (i = 0; i < vi->xattr_shared_count; ++i) { + ret = erofs_init_metabuf(&it->buf, sb, + erofs_sb_has_shared_ea_in_metabox(sbi)); + if (ret) + return ret; + + while (i < vi->xattr_shared_count) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + - vi->xattr_shared_xattrs[i] * sizeof(__le32); + vi->xattr_shared_xattrs[i++] * sizeof(__le32); it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -375,10 +385,10 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, ret = erofs_getxattr_foreach(it); else ret = erofs_listxattr_foreach(it); - if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) + if ((getxattr && ret != -ENODATA) || (!getxattr && ret)) break; } - return ret; + return i ? ret : -ENODATA; } int erofs_getxattr(struct inode *inode, int index, const char *name, @@ -403,7 +413,7 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, EROFS_XATTR_FILTER_SEED + index); hashbit &= EROFS_XATTR_FILTER_BITS - 1; if (vi->xattr_name_filter & (1U << hashbit)) - return -ENOATTR; + return -ENODATA; } it.index = index; @@ -413,13 +423,12 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, it.sb = inode->i_sb; it.buf = __EROFS_BUF_INITIALIZER; - erofs_init_metabuf(&it.buf, it.sb); it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; ret = erofs_xattr_iter_inline(&it, inode, true); - if (ret == -ENOATTR) + if (ret == -ENODATA) ret = erofs_xattr_iter_shared(&it, inode, true); erofs_put_metabuf(&it.buf); return ret ? ret : it.buffer_ofs; @@ -432,23 +441,22 @@ ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); ret = erofs_init_inode_xattrs(inode); - if (ret == -ENOATTR) + if (ret == -ENODATA) return 0; if (ret) return ret; it.sb = dentry->d_sb; it.buf = __EROFS_BUF_INITIALIZER; - erofs_init_metabuf(&it.buf, it.sb); it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; ret = erofs_xattr_iter_inline(&it, inode, false); - if (!ret || ret == -ENOATTR) + if (!ret || ret == -ENODATA) ret = erofs_xattr_iter_shared(&it, inode, false); - if (ret == -ENOATTR) + if (ret == -ENODATA) ret = 0; erofs_put_metabuf(&it.buf); return ret ? ret : it.buffer_ofs; @@ -485,7 +493,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) if (sbi->packed_inode) buf.mapping = sbi->packed_inode->i_mapping; else - erofs_init_metabuf(&buf, sb); + (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { void *ptr = erofs_read_metadata(sb, &buf, &pos, &len); @@ -539,7 +547,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) rc = erofs_getxattr(inode, prefix, "", value, rc); } - if (rc == -ENOATTR) + if (rc == -ENODATA) acl = NULL; else if (rc < 0) acl = ERR_PTR(rc); diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index b246cd0e135e..6317caa8413e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -10,9 +10,6 @@ #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> -/* Attribute not found */ -#define ENOATTR ENODATA - #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index fe8071844724..792f20888a8f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -805,6 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct z_erofs_pcluster *pcl = NULL; + void *ptr; int ret; DBG_BUGON(fe->pcl); @@ -854,15 +855,17 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { - void *mptr; - - mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false); - if (IS_ERR(mptr)) { - ret = PTR_ERR(mptr); - erofs_err(sb, "failed to get inline data %d", ret); + ret = erofs_init_metabuf(&map->buf, sb, + erofs_inode_in_metabox(fe->inode)); + if (ret) + return ret; + ptr = erofs_bread(&map->buf, map->m_pa, false); + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + erofs_err(sb, "failed to get inline folio %d", ret); return ret; } - get_page(map->buf.page); + folio_get(page_folio(map->buf.page)); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; @@ -1034,7 +1037,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, if (!(map->m_flags & EROFS_MAP_MAPPED)) { folio_zero_segment(folio, cur, end); tight = false; - } else if (map->m_flags & EROFS_MAP_FRAGMENT) { + } else if (map->m_flags & __EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; err = z_erofs_read_fragment(inode->i_sb, folio, cur, @@ -1091,7 +1094,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, tight = (bs == PAGE_SIZE); } } while ((end = cur) > 0); - erofs_onlinefolio_end(folio, err); + erofs_onlinefolio_end(folio, err, false); return err; } @@ -1196,7 +1199,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err) cur += len; } kunmap_local(dst); - erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true); list_del(p); kfree(bvi); } @@ -1325,9 +1328,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) /* must handle all compressed pages before actual file pages */ if (pcl->from_meta) { - page = pcl->compressed_bvecs[0].page; + folio_put(page_folio(pcl->compressed_bvecs[0].page)); WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); - put_page(page); } else { /* managed folios are still left in compressed_bvecs[] */ for (i = 0; i < pclusterpages; ++i) { @@ -1355,7 +1357,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - erofs_onlinefolio_end(page_folio(page), err); + erofs_onlinefolio_end(page_folio(page), err, true); continue; } if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 14ea47f954f5..a93efd95c555 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -17,7 +17,7 @@ struct z_erofs_maprecorder { u16 delta[2]; erofs_blk_t pblk, compressedblks; erofs_off_t nextpackoff; - bool partialref; + bool partialref, in_mbox; }; static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, @@ -31,7 +31,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, struct z_erofs_lcluster_index *di; unsigned int advise; - di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox); if (IS_ERR(di)) return PTR_ERR(di); m->lcn = lcn; @@ -146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; - in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true); + in = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox); if (IS_ERR(in)) return PTR_ERR(in); @@ -240,6 +240,13 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, unsigned int lcn, bool lookahead) { + if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { + erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu", + m->type, lcn, EROFS_I(m->inode)->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + switch (EROFS_I(m->inode)->datalayout) { case EROFS_INODE_COMPRESSED_FULL: return z_erofs_load_full_lcluster(m, lcn); @@ -265,12 +272,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, if (err) return err; - if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { - erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; - } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; if (!lookback_distance) break; @@ -325,25 +327,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); - if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - if (m->delta[0] != 1) { - erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - if (m->compressedblks) - goto out; - } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { - /* - * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type - * rather than CBLKCNT, it's a 1 block-sized pcluster. - */ - m->compressedblks = 1; - goto out; + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD && m->delta[0] != 1) { + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; } - erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; + + /* + * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type rather + * than CBLKCNT, it's a 1 block-sized pcluster. + */ + if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD || !m->compressedblks) + m->compressedblks = 1; out: m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; @@ -379,11 +374,6 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; - } else { - erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } lcn += m->delta[1]; } @@ -402,6 +392,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode, struct z_erofs_maprecorder m = { .inode = inode, .map = map, + .in_mbox = erofs_inode_in_metabox(inode), }; int err = 0; unsigned int endoff, afmt; @@ -413,8 +404,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode, !vi->z_tailextent_headlcn) { map->m_la = 0; map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | - EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; return 0; } initial_lcn = ofs >> lclusterbits; @@ -429,44 +419,33 @@ static int z_erofs_map_blocks_fo(struct inode *inode, map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; - switch (m.type) { - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: - if (endoff >= m.clusterofs) { - m.headtype = m.type; - map->m_la = (m.lcn << lclusterbits) | m.clusterofs; - /* - * For ztailpacking files, in order to inline data more - * effectively, special EOF lclusters are now supported - * which can have three parts at most. - */ - if (ztailpacking && end > inode->i_size) - end = inode->i_size; - break; - } - /* m.lcn should be >= 1 if endoff < m.clusterofs */ - if (!m.lcn) { - erofs_err(sb, "invalid logical cluster 0 at nid %llu", - vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; + if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && endoff >= m.clusterofs) { + m.headtype = m.type; + map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + /* + * For ztailpacking files, in order to inline data more + * effectively, special EOF lclusters are now supported + * which can have three parts at most. + */ + if (ztailpacking && end > inode->i_size) + end = inode->i_size; + } else { + if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + /* m.lcn should be >= 1 if endoff < m.clusterofs */ + if (!m.lcn) { + erofs_err(sb, "invalid logical cluster 0 at nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } + end = (m.lcn << lclusterbits) | m.clusterofs; + map->m_flags |= EROFS_MAP_FULL_MAPPED; + m.delta[0] = 1; } - end = (m.lcn << lclusterbits) | m.clusterofs; - map->m_flags |= EROFS_MAP_FULL_MAPPED; - m.delta[0] = 1; - fallthrough; - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; - break; - default: - erofs_err(sb, "unknown type %u @ offset %llu of nid %llu", - m.type, ofs, vi->nid); - err = -EOPNOTSUPP; - goto unmap_out; } if (m.partialref) map->m_flags |= EROFS_MAP_PARTIAL_REF; @@ -489,7 +468,7 @@ static int z_erofs_map_blocks_fo(struct inode *inode, goto unmap_out; } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { - map->m_flags |= EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; } else { map->m_pa = erofs_pos(sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); @@ -543,6 +522,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, unsigned int recsz = z_erofs_extent_recsize(vi->z_advise); erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize), recsz); + bool in_mbox = erofs_inode_in_metabox(inode); erofs_off_t lend = inode->i_size; erofs_off_t l, r, mid, pa, la, lstart; struct z_erofs_extent *ext; @@ -552,7 +532,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, map->m_flags = 0; if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) { if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) { - ext = erofs_read_metabuf(&map->buf, sb, pos, true); + ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox); if (IS_ERR(ext)) return PTR_ERR(ext); pa = le64_to_cpu(*(__le64 *)ext); @@ -565,7 +545,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, } for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) { - ext = erofs_read_metabuf(&map->buf, sb, pos, true); + ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox); if (IS_ERR(ext)) return PTR_ERR(ext); map->m_plen = le32_to_cpu(ext->plen); @@ -585,7 +565,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, for (l = 0, r = vi->z_extents; l < r; ) { mid = l + (r - l) / 2; ext = erofs_read_metabuf(&map->buf, sb, - pos + mid * recsz, true); + pos + mid * recsz, in_mbox); if (IS_ERR(ext)) return PTR_ERR(ext); @@ -597,6 +577,10 @@ static int z_erofs_map_blocks_ext(struct inode *inode, if (la > map->m_la) { r = mid; + if (la > lend) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } lend = la; } else { l = mid + 1; @@ -613,7 +597,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, if (lstart < lend) { map->m_la = lstart; if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { - map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; + map->m_flags = EROFS_MAP_FRAGMENT; vi->z_fragmentoff = map->m_plen; if (recsz > offsetof(struct z_erofs_extent, pstart_lo)) vi->z_fragmentoff |= map->m_pa << 32; @@ -635,22 +619,15 @@ static int z_erofs_map_blocks_ext(struct inode *inode, } } map->m_llen = lend - map->m_la; - if (!last && map->m_llen < sb->s_blocksize) { - erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", - map->m_llen, map->m_la, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } return 0; } -static int z_erofs_fill_inode_lazy(struct inode *inode) +static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; int err, headnr; erofs_off_t pos; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct z_erofs_map_header *h; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { @@ -670,7 +647,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - h = erofs_read_metabuf(&buf, sb, pos, true); + h = erofs_read_metabuf(&map->buf, sb, pos, erofs_inode_in_metabox(inode)); if (IS_ERR(h)) { err = PTR_ERR(h); goto out_unlock; @@ -708,7 +685,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; - goto out_put_metabuf; + goto out_unlock; } if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && @@ -717,7 +694,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto out_put_metabuf; + goto out_unlock; } if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ @@ -725,27 +702,25 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", vi->nid); err = -EFSCORRUPTED; - goto out_put_metabuf; + goto out_unlock; } if (vi->z_idata_size || (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { - struct erofs_map_blocks map = { + struct erofs_map_blocks tm = { .buf = __EROFS_BUF_INITIALIZER }; - err = z_erofs_map_blocks_fo(inode, &map, + err = z_erofs_map_blocks_fo(inode, &tm, EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); + erofs_put_metabuf(&tm.buf); if (err < 0) - goto out_put_metabuf; + goto out_unlock; } done: /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); -out_put_metabuf: - erofs_put_metabuf(&buf); out_unlock: clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); return err; @@ -763,7 +738,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, map->m_la = inode->i_size; map->m_flags = 0; } else { - err = z_erofs_fill_inode_lazy(inode); + err = z_erofs_fill_inode(inode, map); if (!err) { if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) @@ -799,7 +774,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, iomap->length = map.m_llen; if (map.m_flags & EROFS_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ? + iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ? IOMAP_NULL_ADDR : map.m_pa; } else { iomap->type = IOMAP_HOLE; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d4dbffdedd08..b22d6f819f78 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,6 +218,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; + u8 loop_check_depth; /* * usage count, used together with epitem->dying to @@ -883,7 +884,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); - return ep_refcount_dec_and_test(ep); + return true; } /* @@ -891,14 +892,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { - WARN_ON_ONCE(__ep_remove(ep, epi, false)); + if (__ep_remove(ep, epi, false)) + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; - bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) @@ -931,10 +932,8 @@ static void ep_clear_and_put(struct eventpoll *ep) cond_resched(); } - dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - - if (dispose) + if (ep_refcount_dec_and_test(ep)) ep_free(ep); } @@ -1137,7 +1136,7 @@ again: dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); - if (dispose) + if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } @@ -2142,23 +2141,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } /** - * ep_loop_check_proc - verify that adding an epoll file inside another - * epoll structure does not violate the constraints, in - * terms of closed loops, or too deep chains (which can - * result in excessive stack usage). + * ep_loop_check_proc - verify that adding an epoll file @ep inside another + * epoll file does not create closed loops, and + * determine the depth of the subtree starting at @ep * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Return: %zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or %-1 otherwise. + * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { - int error = 0; + int result = 0; struct rb_node *rbp; struct epitem *epi; + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { @@ -2166,13 +2166,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->gen == loop_check_gen) - continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) - error = -1; + result = INT_MAX; else - error = ep_loop_check_proc(ep_tovisit, depth + 1); - if (error != 0) + result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); + if (result > EP_MAX_NESTS) break; } else { /* @@ -2186,9 +2184,25 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) list_file(epi->ffd.file); } } + ep->loop_check_depth = result; mutex_unlock(&ep->mtx); - return error; + return result; +} + +/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */ +static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) +{ + int result = 0; + struct epitem *epi; + + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + hlist_for_each_entry_rcu(epi, &ep->refs, fllink) + result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); + ep->gen = loop_check_gen; + ep->loop_check_depth = result; + return result; } /** @@ -2204,8 +2218,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { + int depth, upwards_depth; + inserting_into = ep; - return ep_loop_check_proc(to, 0); + /* + * Check how deep down we can get from @to, and whether it is possible + * to loop up to @ep. + */ + depth = ep_loop_check_proc(to, 0); + if (depth > EP_MAX_NESTS) + return -1; + /* Check how far up we can go from @ep. */ + rcu_read_lock(); + upwards_depth = ep_get_upwards_depth_proc(ep, 0); + rcu_read_unlock(); + + return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } static void clear_tfile_check_list(void) diff --git a/fs/exec.c b/fs/exec.c index cfbb2b9ee3c9..2a1e5e4042a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -78,6 +78,9 @@ #include <trace/events/sched.h> +/* For vma exec functions. */ +#include "../mm/internal.h" + static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; @@ -111,6 +114,9 @@ static inline void put_binfmt(struct linux_binfmt * fmt) bool path_noexec(const struct path *path) { + /* If it's an anonymous inode make sure that we catch any shenanigans. */ + VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)); return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } @@ -182,60 +188,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } -static int __bprm_mm_init(struct linux_binprm *bprm) -{ - int err; - struct vm_area_struct *vma = NULL; - struct mm_struct *mm = bprm->mm; - - bprm->vma = vma = vm_area_alloc(mm); - if (!vma) - return -ENOMEM; - vma_set_anonymous(vma); - - if (mmap_write_lock_killable(mm)) { - err = -EINTR; - goto err_free; - } - - /* - * Need to be called with mmap write lock - * held, to avoid race with ksmd. - */ - err = ksm_execve(mm); - if (err) - goto err_ksm; - - /* - * Place the stack at the largest stack address the architecture - * supports. Later, we'll move this to an appropriate place. We don't - * use STACK_TOP because that can depend on attributes which aren't - * configured yet. - */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); - vma->vm_end = STACK_TOP_MAX; - vma->vm_start = vma->vm_end - PAGE_SIZE; - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - - err = insert_vm_struct(mm, vma); - if (err) - goto err; - - mm->stack_vm = mm->total_vm = 1; - mmap_write_unlock(mm); - bprm->p = vma->vm_end - sizeof(void *); - return 0; -err: - ksm_exit(mm); -err_ksm: - mmap_write_unlock(mm); -err_free: - bprm->vma = NULL; - vm_area_free(vma); - return err; -} - static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; @@ -288,12 +240,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, { } -static int __bprm_mm_init(struct linux_binprm *bprm) -{ - bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); - return 0; -} - static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; @@ -322,9 +268,13 @@ static int bprm_mm_init(struct linux_binprm *bprm) bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); - err = __bprm_mm_init(bprm); +#ifndef CONFIG_MMU + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); +#else + err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); if (err) goto err; +#endif return 0; @@ -654,7 +604,7 @@ int setup_arg_pages(struct linux_binprm *bprm, struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; - unsigned long vm_flags; + vm_flags_t vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; @@ -834,13 +784,15 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (IS_ERR(file)) return file; + if (path_noexec(&file->f_path)) + return ERR_PTR(-EACCES); + /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || - path_noexec(&file->f_path)) + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode))) return ERR_PTR(-EACCES); err = exe_file_deny_write_access(file); @@ -1563,7 +1515,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * state is protected by cred_guard_mutex we hold. */ n_fs = 1; - spin_lock(&p->fs->lock); + read_seqlock_excl(&p->fs->seq); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) @@ -1576,7 +1528,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; - spin_unlock(&p->fs->lock); + read_sequnlock_excl(&p->fs->seq); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 841a5b18e3df..6b82497572b4 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -532,11 +532,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return blkdev_issue_flush(inode->i_sb->s_bdev); } -static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) +static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size) { int err; loff_t pos; - struct inode *inode = file_inode(file); struct exfat_inode_info *ei = EXFAT_I(inode); struct address_space *mapping = inode->i_mapping; const struct address_space_operations *ops = mapping->a_ops; @@ -551,14 +550,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (pos + len > new_valid_size) len = new_valid_size - pos; - err = ops->write_begin(file, mapping, pos, len, &folio, NULL); + err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL); if (err) goto out; off = offset_in_folio(folio, pos); folio_zero_new_buffers(folio, off, off + len); - err = ops->write_end(file, mapping, pos, len, len, folio, NULL); + err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; @@ -604,7 +603,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) } if (pos > valid_size) { - ret = exfat_extend_valid_size(file, pos); + ret = exfat_extend_valid_size(inode, pos); if (ret < 0 && ret != -ENOSPC) { exfat_err(inode->i_sb, "write: fail to zero from %llu to %llu(%zd)", @@ -665,7 +664,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) start + vma->vm_end - vma->vm_start); if (ei->valid_size < end) { - err = exfat_extend_valid_size(file, end); + err = exfat_extend_valid_size(inode, end); if (err < 0) { inode_unlock(inode); return vmf_fs_error(err); @@ -683,13 +682,15 @@ static const struct vm_operations_struct exfat_file_vm_ops = { .page_mkwrite = exfat_page_mkwrite, }; -static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) +static int exfat_file_mmap_prepare(struct vm_area_desc *desc) { - if (unlikely(exfat_forced_shutdown(file_inode(file)->i_sb))) + struct file *file = desc->file; + + if (unlikely(exfat_forced_shutdown(file_inode(desc->file)->i_sb))) return -EIO; file_accessed(file); - vma->vm_ops = &exfat_file_vm_ops; + desc->vm_ops = &exfat_file_vm_ops; return 0; } @@ -710,7 +711,7 @@ const struct file_operations exfat_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, #endif - .mmap = exfat_file_mmap, + .mmap_prepare = exfat_file_mmap_prepare, .fsync = exfat_file_fsync, .splice_read = exfat_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index b22c02d6000f..c10844e1e16c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -446,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) } } -static int exfat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, - struct folio **foliop, void **fsdata) +static int exfat_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + struct folio **foliop, void **fsdata) { int ret; @@ -463,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int exfat_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct folio *folio, void *fsdata) +static int exfat_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) exfat_write_failed(mapping, pos+len); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 7ed858937d45..ea5c1334a214 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -667,9 +667,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) } if (sbi->options.utf8) - sb->s_d_op = &exfat_utf8_dentry_ops; + set_default_d_op(sb, &exfat_utf8_dentry_ops); else - sb->s_d_op = &exfat_dentry_ops; + set_default_d_op(sb, &exfat_dentry_ops); root_inode = new_inode(sb); if (!root_inode) { diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index cdefea17986a..d3e55de4a2a2 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -549,15 +549,13 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, goto err_result; } - inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_idmap(mnt), &QSTR(nbuf), target_dir); + nresult = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), target_dir); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); nresult = ERR_PTR(-ESTALE); } } - inode_unlock(target_dir->d_inode); /* * At this point we are done with the parent, but it's pinned * by the child dentry anyway. diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 402fecf90a44..b07b3b369710 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 4025f875252a..cf97b76e9fd3 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -750,9 +750,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); /* ioctl.c */ -extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern int ext2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); extern long ext2_ioctl(struct file *, unsigned int, unsigned long); extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 10b061ac5bc0..76bddce462fc 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -122,17 +122,19 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { .pfn_mkwrite = ext2_dax_fault, }; -static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +static int ext2_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; + if (!IS_DAX(file_inode(file))) - return generic_file_mmap(file, vma); + return generic_file_mmap_prepare(desc); file_accessed(file); - vma->vm_ops = &ext2_dax_vm_ops; + desc->vm_ops = &ext2_dax_vm_ops; return 0; } #else -#define ext2_file_mmap generic_file_mmap +#define ext2_file_mmap_prepare generic_file_mmap_prepare #endif /* @@ -316,7 +318,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = ext2_file_mmap, + .mmap_prepare = ext2_file_mmap_prepare, .open = ext2_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 30f8201c155f..e10c376843d7 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -895,9 +895,19 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret; + loff_t i_size; inode_lock(inode); - len = min_t(u64, len, i_size_read(inode)); + i_size = i_size_read(inode); + /* + * iomap_fiemap() returns EINVAL for 0 length. Make sure we don't trim + * length to 0 but still trim the range as much as possible since + * ext2_get_blocks() iterates unmapped space block by block which is + * slow. + */ + if (i_size == 0) + i_size = 1; + len = min_t(u64, len, i_size); ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops); inode_unlock(inode); @@ -915,7 +925,7 @@ static void ext2_readahead(struct readahead_control *rac) } static int -ext2_write_begin(struct file *file, struct address_space *mapping, +ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; @@ -926,13 +936,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int ext2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int ext2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 44e04484e570..c3fea55b8efa 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -18,7 +18,7 @@ #include <linux/uaccess.h> #include <linux/fileattr.h> -int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); @@ -28,7 +28,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ext2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct ext2_inode_info *ei = EXT2_I(inode); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c48fd36b2d74..c9329ed5c094 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * possible we just missed a transaction commit that did so */ smp_mb(); - if (sbi->s_mb_free_pending == 0) { + if (atomic_read(&sbi->s_mb_free_pending) == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 18373de980f2..01a6e2de7fc3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -157,7 +157,7 @@ enum criteria { /* * Reads each block group sequentially, performing disk IO if - * necessary, to find find_suitable block group. Tries to + * necessary, to find suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ @@ -185,14 +185,8 @@ enum criteria { /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 -/* blocks already reserved */ -#define EXT4_MB_HINT_RESERVED 0x0002 -/* metadata is being allocated */ -#define EXT4_MB_HINT_METADATA 0x0004 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 -/* search for the best chunk */ -#define EXT4_MB_HINT_BEST 0x0010 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ @@ -213,15 +207,6 @@ enum criteria { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 -/* Large fragment size list lookup succeeded at least once for - * CR_POWER2_ALIGNED */ -#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 -/* Avg fragment size rb tree lookup succeeded at least once for - * CR_GOAL_LEN_FAST */ -#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 -/* Avg fragment size rb tree lookup succeeded at least once for - * CR_BEST_AVAIL_LEN */ -#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -1608,16 +1593,14 @@ struct ext4_sb_info { unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; - unsigned int s_mb_free_pending; + atomic_t s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; - struct list_head *s_mb_avg_fragment_size; - rwlock_t *s_mb_avg_fragment_size_locks; - struct list_head *s_mb_largest_free_orders; - rwlock_t *s_mb_largest_free_orders_locks; + struct xarray *s_mb_avg_fragment_size; + struct xarray *s_mb_largest_free_orders; /* tunables */ unsigned long s_stripe; @@ -1629,15 +1612,16 @@ struct ext4_sb_info { unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; unsigned int s_sb_update_sec; unsigned int s_sb_update_kb; + /* where last allocation was done - for stream allocation */ + ext4_group_t *s_mb_last_groups; + unsigned int s_mb_nr_global_goals; + /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ @@ -1646,12 +1630,10 @@ struct ext4_sb_info { atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - atomic_t s_bal_p2_aligned_bad_suggestions; - atomic_t s_bal_goal_fast_bad_suggestions; - atomic_t s_bal_best_avail_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ @@ -3020,7 +3002,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); -bool ext4_should_enable_large_folio(struct inode *inode); +void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3064,9 +3046,9 @@ extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); -extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, @@ -3103,8 +3085,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); -int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); @@ -3489,8 +3471,6 @@ struct ext4_group_info { void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - struct list_head bb_avg_fragment_size_node; - struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means @@ -3541,23 +3521,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } +static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) +{ + if (!spin_trylock(ext4_group_lock_ptr(sb, group))) + return false; + /* + * We're able to grab the lock right away, so drop the lock + * contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + return true; +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - spinlock_t *lock = ext4_group_lock_ptr(sb, group); - if (spin_trylock(lock)) - /* - * We're able to grab the lock right away, so drop the - * lock contention counter. - */ - atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); - else { + if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); - spin_lock(lock); + spin_lock(ext4_group_lock_ptr(sb, group)); } } @@ -3612,6 +3597,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); +extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, @@ -3671,10 +3657,10 @@ static inline int ext4_has_inline_data(struct inode *inode) extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); -extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len); +extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *dir_block, + unsigned int parent_ino, void *inline_buf, + int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 26435f3a3094..c484125d963f 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -31,13 +31,6 @@ #define CHECK_BINSEARCH__ /* - * If EXT_STATS is defined then stats numbers are collected. - * These number will be displayed at umount time. - */ -#define EXT_STATS_ - - -/* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b543a46fc809..ca5499e9412b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4501,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, struct ext4_map_blocks map; unsigned int credits; loff_t epos, old_size = i_size_read(inode); + unsigned int blkbits = inode->i_blkbits; + bool alloc_zero = false; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4514,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* + * Do the actual write zero during a running journal transaction + * costs a lot. First allocate an unwritten extent and then + * convert it to written after zeroing it out. + */ + if (flags & EXT4_GET_BLOCKS_ZERO) { + flags &= ~EXT4_GET_BLOCKS_ZERO; + flags |= EXT4_GET_BLOCKS_UNWRIT_EXT; + alloc_zero = true; + } + + /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); @@ -4549,9 +4562,7 @@ retry: * allow a full retry cycle for any remaining allocations */ retries = 0; - map.m_lblk += ret; - map.m_len = len = len - ret; - epos = (loff_t)map.m_lblk << inode->i_blkbits; + epos = (loff_t)(map.m_lblk + ret) << blkbits; inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) @@ -4571,6 +4582,21 @@ retry: ret2 = ret3 ? ret3 : ret2; if (unlikely(ret2)) break; + + if (alloc_zero && + (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { + ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, + map.m_len); + if (likely(!ret2)) + ret2 = ext4_convert_unwritten_extents(NULL, + inode, (loff_t)map.m_lblk << blkbits, + (loff_t)map.m_len << blkbits); + if (ret2) + break; + } + + map.m_lblk += ret; + map.m_len = len = len - ret; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -4636,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (end_lblk > start_lblk) { ext4_lblk_t zero_blks = end_lblk - start_lblk; - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + if (mode & FALLOC_FL_WRITE_ZEROES) + flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; + else + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, new_size, flags); if (ret) @@ -4745,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; + /* + * Don't allow writing zeroes if the underlying device does not + * enable the unmap write zeroes operation. + */ + if ((mode & FALLOC_FL_WRITE_ZEROES) && + !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev)) + return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | - FALLOC_FL_INSERT_RANGE)) + FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES)) return -EOPNOTSUPP; inode_lock(inode); @@ -4780,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) goto out_invalidate_lock; - if (mode & FALLOC_FL_PUNCH_HOLE) + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: ret = ext4_punch_hole(file, offset, len); - else if (mode & FALLOC_FL_COLLAPSE_RANGE) + break; + case FALLOC_FL_COLLAPSE_RANGE: ret = ext4_collapse_range(file, offset, len); - else if (mode & FALLOC_FL_INSERT_RANGE) + break; + case FALLOC_FL_INSERT_RANGE: ret = ext4_insert_range(file, offset, len); - else if (mode & FALLOC_FL_ZERO_RANGE) + break; + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_WRITE_ZEROES: ret = ext4_zero_range(file, offset, len, mode); - else + break; + default: ret = -EOPNOTSUPP; + } out_invalidate_lock: filemap_invalidate_unlock(mapping); @@ -5171,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, credits = depth + 2; } - restart_credits = ext4_writepage_trans_blocks(inode); + restart_credits = ext4_chunk_trans_extent(inode, 0); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { @@ -5431,7 +5475,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -5527,7 +5571,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) truncate_pagecache(inode, start); - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 0); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 21df81347147..93240e35ee36 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -747,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; - pfn_t pfn; + unsigned long pfn; if (write) { sb_start_pagefault(sb); @@ -804,9 +804,10 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .page_mkwrite = ext4_page_mkwrite, }; -static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +static int ext4_file_mmap_prepare(struct vm_area_desc *desc) { int ret; + struct file *file = desc->file; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; @@ -821,15 +822,15 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ - if (!daxdev_mapping_supported(vma, dax_dev)) + if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { - vma->vm_ops = &ext4_dax_vm_ops; - vm_flags_set(vma, VM_HUGEPAGE); + desc->vm_ops = &ext4_dax_vm_ops; + desc->vm_flags |= VM_HUGEPAGE; } else { - vma->vm_ops = &ext4_file_vm_ops; + desc->vm_ops = &ext4_file_vm_ops; } return 0; } @@ -968,7 +969,7 @@ const struct file_operations ext4_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = ext4_file_mmap, + .mmap_prepare = ext4_file_mmap_prepare, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, @@ -977,7 +978,8 @@ const struct file_operations ext4_file_operations = { .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_DIO_PARALLEL_WRITE, + FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 79aa3df8d019..df4051613b29 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1335,8 +1335,7 @@ got: } } - if (ext4_should_enable_large_folio(inode)) - mapping_set_large_folios(inode->i_mapping); + ext4_set_inode_mapping_order(inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a1bbcdf40824..1b094a4f3866 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -303,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle, if (error) goto out; - BUG_ON(!is.s.not_found); + if (!is.s.not_found) { + EXT4_ERROR_INODE(inode, "unexpected inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { @@ -354,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error) goto out; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, "missing inline data xattr"); + error = -EFSCORRUPTED; + goto out; + } len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); @@ -562,7 +570,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -612,6 +620,7 @@ retry: } else ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block); + clear_buffer_new(folio_buffers(folio)); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -891,6 +900,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, return ret; } + clear_buffer_new(folio_buffers(folio)); folio_mark_dirty(folio); folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -995,7 +1005,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode, } /* Set the final de to cover the whole block. */ -static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +void ext4_update_final_de(void *de_buf, int old_size, int new_size) { struct ext4_dir_entry_2 *de, *prev_de; void *limit; @@ -1059,51 +1069,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); } -static int ext4_finish_convert_inline_dir(handle_t *handle, - struct inode *inode, - struct buffer_head *dir_block, - void *buf, - int inline_size) -{ - int err, csum_size = 0, header_size = 0; - struct ext4_dir_entry_2 *de; - void *target = dir_block->b_data; - - /* - * First create "." and ".." and then copy the dir information - * back to the block. - */ - de = target; - de = ext4_init_dot_dotdot(inode, de, - inode->i_sb->s_blocksize, csum_size, - le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); - header_size = (void *)de - target; - - memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, - inline_size - EXT4_INLINE_DOTDOT_SIZE); - - if (ext4_has_feature_metadata_csum(inode->i_sb)) - csum_size = sizeof(struct ext4_dir_entry_tail); - - inode->i_size = inode->i_sb->s_blocksize; - i_size_write(inode, inode->i_sb->s_blocksize); - EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - ext4_update_final_de(dir_block->b_data, - inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, - inode->i_sb->s_blocksize - csum_size); - - if (csum_size) - ext4_initialize_dirent_tail(dir_block, - inode->i_sb->s_blocksize); - set_buffer_uptodate(dir_block); - unlock_buffer(dir_block); - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); - if (err) - return err; - set_buffer_verified(dir_block); - return ext4_mark_inode_dirty(handle, inode); -} - static int ext4_convert_inline_data_nolock(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) @@ -1175,8 +1140,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { - error = ext4_finish_convert_inline_dir(handle, inode, data_bh, - buf, inline_size); + unlock_buffer(data_bh); + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + + error = ext4_init_dirblock(handle, inode, data_bh, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (!error) + error = ext4_mark_inode_dirty(handle, inode); } out_restore: @@ -1315,7 +1289,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, if (pos == 0) { fake.inode = cpu_to_le32(inode->i_ino); fake.name_len = 1; - strcpy(fake.name, "."); + memcpy(fake.name, ".", 2); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); @@ -1325,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { fake.inode = cpu_to_le32(parent_ino); fake.name_len = 2; - strcpy(fake.name, ".."); + memcpy(fake.name, "..", 3); fake.rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(fake.name_len, NULL), inline_size); @@ -1864,7 +1838,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) }; - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1903,7 +1877,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) goto out_error; - BUG_ON(is.s.not_found); + if (is.s.not_found) { + EXT4_ERROR_INODE(inode, + "missing inline data xattr"); + err = -EFSCORRUPTED; + goto out_error; + } value_len = le32_to_cpu(is.s.here->e_value_size); value = kmalloc(value_len, GFP_NOFS); @@ -1979,7 +1958,7 @@ int ext4_convert_inline_data(struct inode *inode) return 0; } - needed_blocks = ext4_writepage_trans_blocks(inode); + needed_blocks = ext4_chunk_trans_extent(inode, 1); iloc.bh = NULL; error = ext4_get_inode_loc(inode, &iloc); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index be9a4cba35fd..ed54c4d0f2f9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -723,8 +723,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_check_map_extents_env(inode); /* Lookup extent status tree firstly */ - if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -757,8 +756,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, orig_mlen == map->m_len) goto found; - if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) - map->m_len = orig_mlen; + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -877,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } +/* + * Make sure that the current journal transaction has enough credits to map + * one extent. Return -EAGAIN if it cannot extend the current running + * transaction. + */ +static inline int ext4_journal_ensure_extent_credits(handle_t *handle, + struct inode *inode) +{ + int credits; + int ret; + + /* Called from ext4_da_write_begin() which has no handle started? */ + if (!handle) + return 0; + + credits = ext4_chunk_trans_blocks(inode, 1); + ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); + return ret <= 0 ? ret : -EAGAIN; +} + static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { @@ -1171,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, } continue; } - if (buffer_new(bh)) + if (WARN_ON_ONCE(buffer_new(bh))) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); + err = ext4_journal_ensure_extent_credits(handle, inode); + if (!err) + err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { @@ -1252,7 +1272,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ -static int ext4_write_begin(struct file *file, struct address_space *mapping, +static int ext4_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -1263,7 +1284,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; - fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -1274,7 +1294,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + needed_blocks = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)) + 1; index = pos >> PAGE_SHIFT; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -1287,16 +1308,14 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * __filemap_get_folio() can take a long time if the + * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: - fgp |= fgf_set_order(len); - folio = __filemap_get_folio(mapping, index, fgp, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -1374,8 +1393,9 @@ retry_journal: ext4_orphan_del(NULL, inode); } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -EAGAIN || + (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_journal; folio_put(folio); return ret; @@ -1395,17 +1415,18 @@ static int write_end_fn(handle_t *handle, struct inode *inode, ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); + clear_buffer_new(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). + * `iocb` can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ -static int ext4_write_end(struct file *file, +static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -1424,7 +1445,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1510,7 +1531,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, } while (bh != head); } -static int ext4_journalled_write_end(struct file *file, +static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -1667,11 +1688,12 @@ struct mpage_da_data { unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ - pgoff_t first_page; /* The first page to write */ - pgoff_t next_page; /* Current page to examine */ - pgoff_t last_page; /* Last page to examine */ + loff_t start_pos; /* The start pos to write */ + loff_t next_pos; /* Current pos to examine */ + loff_t end_pos; /* Last pos to examine */ + /* - * Extent to map - this can be after first_page because that can be + * Extent to map - this can be after start_pos because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ @@ -1691,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - /* This is necessary when next_page == 0. */ - if (mpd->first_page >= mpd->next_page) + /* This is necessary when next_pos == 0. */ + if (mpd->start_pos >= mpd->next_pos) return; mpd->scanned_until_end = 0; - index = mpd->first_page; - end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; - start = index << (PAGE_SHIFT - inode->i_blkbits); - last = end << (PAGE_SHIFT - inode->i_blkbits); + start = EXT4_B_TO_LBLK(inode, mpd->start_pos); + last = mpd->next_pos >> inode->i_blkbits; /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); - ext4_es_remove_extent(inode, start, last - start + 1); + ext4_es_remove_extent(inode, start, last - start); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); - while (index <= end) { - nr = filemap_get_folios(mapping, &index, end, &fbatch); + index = mpd->start_pos >> PAGE_SHIFT; + end = mpd->next_pos >> PAGE_SHIFT; + while (index < end) { + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; - if (folio->index < mpd->first_page) + if (folio_pos(folio) < mpd->start_pos) continue; - if (folio_next_index(folio) - 1 > end) + if (folio_next_index(folio) > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2024,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - mpd->first_page += folio_nr_pages(folio); + mpd->start_pos += folio_size(folio); + mpd->wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } @@ -2034,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) loff_t size; int err; - BUG_ON(folio->index != mpd->first_page); + WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path @@ -2055,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); - if (!err) - mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2323,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) int get_blocks_flags; int err, dioread_nolock; + /* Make sure transaction has enough credits for this extent */ + err = ext4_journal_ensure_extent_credits(handle, inode); + if (err < 0) + return err; + trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or @@ -2362,6 +2388,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } /* + * This is used to submit mapped buffers in a single folio that is not fully + * mapped for various reasons, such as insufficient space or journal credits. + */ +static int mpage_submit_partial_folio(struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct folio *folio; + loff_t pos; + int ret; + + folio = filemap_get_folio(inode->i_mapping, + mpd->start_pos >> PAGE_SHIFT); + if (IS_ERR(folio)) + return PTR_ERR(folio); + /* + * The mapped position should be within the current processing folio + * but must not be the folio start position. + */ + pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; + if (WARN_ON_ONCE((folio_pos(folio) == pos) || + !folio_contains(folio, pos >> PAGE_SHIFT))) + return -EINVAL; + + ret = mpage_submit_folio(mpd, folio); + if (ret) + goto out; + /* + * Update start_pos to prevent this folio from being released in + * mpage_release_unused_pages(), it will be reset to the aligned folio + * pos when this folio is written again in the next round. Additionally, + * do not update wbc->nr_to_write here, as it will be updated once the + * entire folio has finished processing. + */ + mpd->start_pos = pos; +out: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +/* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * @@ -2409,10 +2476,18 @@ static int mpage_map_and_submit_extent(handle_t *handle, * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ - if ((err == -ENOMEM) || + if ((err == -ENOMEM) || (err == -EAGAIN) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { - if (progress) + /* + * We may have already allocated extents for + * some bhs inside the folio, issue the + * corresponding data to prevent stale data. + */ + if (progress) { + if (mpage_submit_partial_folio(mpd)) + goto invalidate_dirty_pages; goto update_disksize; + } return err; } ext4_msg(sb, KERN_CRIT, @@ -2446,7 +2521,7 @@ update_disksize: * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ - disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; + disksize = mpd->start_pos; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; @@ -2470,21 +2545,6 @@ update_disksize: return err; } -/* - * Calculate the total number of credits to reserve for one writepages - * iteration. This is called from ext4_writepages(). We map an extent of - * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping - * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + - * bpp - 1 blocks in bpp different extents. - */ -static int ext4_da_writepages_trans_blocks(struct inode *inode) -{ - int bpp = ext4_journal_blocks_per_folio(inode); - - return ext4_meta_trans_blocks(inode, - MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); -} - static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { @@ -2549,8 +2609,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - pgoff_t index = mpd->first_page; - pgoff_t end = mpd->last_page; + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; @@ -2565,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; - mpd->next_page = index; + mpd->next_pos = mpd->start_pos; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); @@ -2596,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != folio->index) + if (mpd->map.m_len > 0 && + mpd->next_pos != folio_pos(folio)) goto out; if (handle) { @@ -2642,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) } if (mpd->map.m_len == 0) - mpd->first_page = folio->index; - mpd->next_page = folio_next_index(folio); + mpd->start_pos = folio_pos(folio); + mpd->next_pos = folio_pos(folio) + folio_size(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -2771,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { + int bpf = ext4_journal_blocks_per_folio(inode); /* * We may need to convert up to one extent per block in - * the page and we may dirty the inode. + * the folio and we may dirty the inode. */ - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, - PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) @@ -2786,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; - mpd->first_page = writeback_index; - mpd->last_page = -1; + mpd->start_pos = writeback_index << PAGE_SHIFT; + mpd->end_pos = LLONG_MAX; } else { - mpd->first_page = wbc->range_start >> PAGE_SHIFT; - mpd->last_page = wbc->range_end >> PAGE_SHIFT; + mpd->start_pos = wbc->range_start; + mpd->end_pos = wbc->range_end; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, mpd->first_page, - mpd->last_page); + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, + mpd->end_pos >> PAGE_SHIFT); blk_start_plug(&plug); /* @@ -2840,8 +2901,14 @@ retry: * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = ext4_da_writepages_trans_blocks(inode); - + /* + * Calculate the number of credits needed to reserve for one + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will + * attempt to extend the transaction or start a new iteration + * if the reserved credits are insufficient. + */ + needed_blocks = ext4_chunk_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); @@ -2857,7 +2924,8 @@ retry: } mpd->do_map = 1; - trace_ext4_da_write_pages(inode, mpd->first_page, wbc); + trace_ext4_da_write_folios_start(inode, mpd->start_pos, + mpd->next_pos, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, @@ -2895,6 +2963,8 @@ retry: } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; + trace_ext4_da_write_folios_end(inode, mpd->start_pos, + mpd->next_pos, wbc, ret); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2906,6 +2976,8 @@ retry: ret = 0; continue; } + if (ret == -EAGAIN) + ret = 0; /* Fatal error - ENOMEM, EIO... */ if (ret) break; @@ -2914,8 +2986,8 @@ unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - mpd->last_page = writeback_index - 1; - mpd->first_page = 0; + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; + mpd->start_pos = 0; goto retry; } @@ -2925,7 +2997,7 @@ unplug: * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = mpd->first_page; + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, @@ -3036,7 +3108,8 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, +static int ext4_da_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -3044,7 +3117,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; - fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -3054,7 +3126,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; - return ext4_write_begin(file, mapping, pos, + return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; @@ -3070,9 +3142,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - fgp |= fgf_set_order(len); - folio = __filemap_get_folio(mapping, index, fgp, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -3144,8 +3214,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ - copied = block_write_end(NULL, mapping, pos, len, copied, - folio, NULL); + copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* @@ -3196,7 +3265,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, return copied; } -static int ext4_da_write_end(struct file *file, +static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -3205,7 +3274,7 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) - return ext4_write_end(file, mapping, pos, + return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); @@ -4389,7 +4458,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 2); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); @@ -4538,7 +4607,7 @@ int ext4_truncate(struct inode *inode) } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - credits = ext4_writepage_trans_blocks(inode); + credits = ext4_chunk_trans_extent(inode, 1); else credits = ext4_blocks_for_truncate(inode); @@ -5106,7 +5175,7 @@ error: return -EFSCORRUPTED; } -bool ext4_should_enable_large_folio(struct inode *inode) +static bool ext4_should_enable_large_folio(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -5123,6 +5192,22 @@ bool ext4_should_enable_large_folio(struct inode *inode) return true; } +/* + * Limit the maximum folio order to 2048 blocks to prevent overestimation + * of reserve handle credits during the folio writeback in environments + * where the PAGE_SIZE exceeds 4KB. + */ +#define EXT4_MAX_PAGECACHE_ORDER(i) \ + umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) +void ext4_set_inode_mapping_order(struct inode *inode) +{ + if (!ext4_should_enable_large_folio(inode)) + return; + + mapping_set_folio_order_range(inode->i_mapping, 0, + EXT4_MAX_PAGECACHE_ORDER(inode)); +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -5440,8 +5525,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if (ext4_should_enable_large_folio(inode)) - mapping_set_large_folios(inode->i_mapping); + + ext4_set_inode_mapping_order(inode); ret = check_igot_inode(inode, flags, function, line); /* @@ -6139,7 +6224,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) int ret; /* - * How many index and lead blocks need to touch to map @lblocks + * How many index and leaf blocks need to touch to map @lblocks * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); @@ -6148,7 +6233,7 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -6165,25 +6250,19 @@ int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) } /* - * Calculate the total number of credits to reserve to fit - * the modification of a single pages into a single transaction, - * which may include multiple chunks of block allocations. - * - * This could be called via ext4_write_begin() - * - * We need to consider the worse case, when - * one new block per extent. + * Calculate the journal credits for modifying the number of blocks + * in a single extent within one transaction. 'nrblocks' is used only + * for non-extent inodes. For extent type inodes, 'nrblocks' can be + * zero if the exact number of blocks is unknown. */ -int ext4_writepage_trans_blocks(struct inode *inode) +int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) { - int bpp = ext4_journal_blocks_per_folio(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, bpp); - + ret = ext4_meta_trans_blocks(inode, nrblocks, 1); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret += bpp; + ret += nrblocks; return ret; } @@ -6555,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, return !buffer_mapped(bh); } +static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, + get_block_t get_block) +{ + handle_t *handle; + loff_t size; + unsigned long len; + int credits; + int ret; + + credits = ext4_chunk_trans_extent(inode, + ext4_journal_blocks_per_folio(inode)); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + folio_lock(folio); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { + ret = -EFAULT; + goto out_error; + } + + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); + + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); + if (ret) + goto out_error; + + if (!ext4_should_journal_data(inode)) { + block_commit_write(folio, 0, len); + folio_mark_dirty(folio); + } else { + ret = ext4_journal_folio_buffers(handle, folio, len); + if (ret) + goto out_error; + } + ext4_journal_stop(handle); + folio_wait_stable(folio); + return ret; + +out_error: + folio_unlock(folio); + ext4_journal_stop(handle); + return ret; +} + vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -6566,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; - handle_t *handle; - get_block_t *get_block; + get_block_t *get_block = ext4_get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) @@ -6635,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; - else - get_block = ext4_get_block; retry_alloc: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - /* - * Data journalling can't use block_page_mkwrite() because it - * will set_buffer_dirty() before do_journal_get_write_access() - * thus might hit warning messages for dirty metadata buffers. - */ - if (!ext4_should_journal_data(inode)) { - err = block_page_mkwrite(vma, vmf, get_block); - } else { - folio_lock(folio); - size = i_size_read(inode); - /* Page got truncated from under us? */ - if (folio->mapping != mapping || folio_pos(folio) > size) { - ret = VM_FAULT_NOPAGE; - goto out_error; - } - - len = folio_size(folio); - if (folio_pos(folio) + len > size) - len = size - folio_pos(folio); - - err = ext4_block_write_begin(handle, folio, 0, len, - ext4_get_block); - if (!err) { - ret = VM_FAULT_SIGBUS; - if (ext4_journal_folio_buffers(handle, folio, len)) - goto out_error; - } else { - folio_unlock(folio); - } - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + /* Start journal and allocate blocks */ + err = ext4_block_page_mkwrite(inode, folio, get_block); + if (err == -EAGAIN || + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) goto retry_alloc; out_ret: ret = vmf_fs_error(err); @@ -6683,8 +6774,4 @@ out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; -out_error: - folio_unlock(folio); - ext4_journal_stop(handle); - goto out; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5668a17458ae..84e3c73952d7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -980,7 +980,7 @@ group_add_out: return err; } -int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct ext4_inode_info *ei = EXT4_I(inode); @@ -997,7 +997,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ext4_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 flags = fa->flags; diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index d634c12f1984..a9416b20ff64 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -155,6 +155,7 @@ static struct super_block *mbt_ext4_alloc_super_block(void) bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_es = &fsb->es; + sbi->s_sb = sb; sb->s_fs_info = sbi; up_write(&sb->s_umount); @@ -802,6 +803,8 @@ static void test_mb_mark_used(struct kunit *test) KUNIT_ASSERT_EQ(test, ret, 0); grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); for (i = 0; i < TEST_RANGE_COUNT; i++) test_mb_mark_used_range(test, &e4b, ranges[i].start, @@ -875,6 +878,8 @@ static void test_mb_free_blocks(struct kunit *test) ext4_unlock_group(sb, TEST_GOAL_GROUP); grp->bb_free = 0; + grp->bb_largest_free_order = -1; + grp->bb_avg_fragment_size_order = -1; memset(bitmap, 0xff, sb->s_blocksize); mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1e98c5be4e0a..5898d92ba19f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -132,25 +132,30 @@ * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * - * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) * - * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * Locking: Writers use xa_lock, readers use rcu_read_lock. * - * This is an array of lists where the index in the array represents the + * This is an array of xarrays where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of - * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total - * number of buddy bitmap orders possible) number of lists. Group-infos are - * placed in appropriate lists. + * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of xarrays. Group-infos are + * placed in appropriate xarrays. * - * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) * - * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * Locking: Writers use xa_lock, readers use rcu_read_lock. * - * This is an array of lists where in the i-th list there are groups with + * This is an array of xarrays where in the i-th xarray there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. - * Note that we don't bother with a special list for completely empty groups - * so we only have MB_NUM_ORDERS(sb) lists. + * Note that we don't bother with a special xarray for completely empty + * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed + * in appropriate xarrays. + * + * In xarray, the index is the block group number, the value is the block group + * information, and a non-empty value indicates the block group is present in + * the current xarray. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for @@ -420,8 +425,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); -static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, enum criteria cr); +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -841,132 +846,161 @@ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int new_order; + int new, old; - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) + if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) return; - new_order = mb_avg_fragment_size_order(sb, - grp->bb_free / grp->bb_fragments); - if (new_order == grp->bb_avg_fragment_size_order) + old = grp->bb_avg_fragment_size_order; + new = grp->bb_fragments == 0 ? -1 : + mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); + if (new == old) return; - if (grp->bb_avg_fragment_size_order != -1) { - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_del(&grp->bb_avg_fragment_size_node); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + if (old >= 0) + xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); + + grp->bb_avg_fragment_size_order = new; + if (new >= 0) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. + * Although allocation for insertion may fails, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", + grp->bb_group, new, err); + } +} + +static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, + struct xarray *xa, + ext4_group_t start, ext4_group_t end) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + enum criteria cr = ac->ac_criteria; + ext4_group_t ngroups = ext4_get_groups_count(sb); + unsigned long group = start; + struct ext4_group_info *grp; + + if (WARN_ON_ONCE(end > ngroups || start >= end)) + return 0; + + xa_for_each_range(xa, group, grp, start, end - 1) { + int err; + + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); + + err = ext4_mb_scan_group(ac, grp->bb_group); + if (err || ac->ac_status != AC_STATUS_CONTINUE) + return err; + + cond_resched(); } - grp->bb_avg_fragment_size_order = new_order; - write_lock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); - list_add_tail(&grp->bb_avg_fragment_size_node, - &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ - grp->bb_avg_fragment_size_order]); + + return 0; +} + +/* + * Find a suitable group of given order from the largest free orders xarray. + */ +static inline int +ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) +{ + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; + + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ -static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group) +static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, + ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter; int i; + int ret = 0; + ext4_group_t start, end; - if (ac->ac_status == AC_STATUS_FOUND) - return; - - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) - atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { - if (list_empty(&sbi->s_mb_largest_free_orders[i])) - continue; - read_lock(&sbi->s_mb_largest_free_orders_locks[i]); - if (list_empty(&sbi->s_mb_largest_free_orders[i])) { - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - continue; - } - list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], - bb_largest_free_order_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - *group = iter->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - return; - } - } - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; } + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + /* Increment cr and search again if no group is found */ - *new_cr = CR_GOAL_LEN_FAST; + ac->ac_criteria = CR_GOAL_LEN_FAST; + return ret; } /* - * Find a suitable group of given order from the average fragments list. + * Find a suitable group of given order from the average fragments xarray. */ -static struct ext4_group_info * -ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) +static int +ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, + int order, ext4_group_t start, + ext4_group_t end) { - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; - rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; - struct ext4_group_info *grp = NULL, *iter; - enum criteria cr = ac->ac_criteria; + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; - if (list_empty(frag_list)) - return NULL; - read_lock(frag_list_lock); - if (list_empty(frag_list)) { - read_unlock(frag_list_lock); - return NULL; - } - list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { - grp = iter; - break; - } - } - read_unlock(frag_list_lock); - return grp; + if (xa_empty(xa)) + return 0; + + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ -static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group) +static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, + ext4_group_t group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL; - int i; - - if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { - if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); + int i, ret = 0; + ext4_group_t start, end; + + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: + i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); + for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; } - - for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); - i < MB_NUM_ORDERS(ac->ac_sb); i++) { - grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - return; - } + if (start) { + end = start; + start = 0; + goto wrap_around; } + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to @@ -976,9 +1010,11 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) - *new_cr = CR_BEST_AVAIL_LEN; + ac->ac_criteria = CR_BEST_AVAIL_LEN; else - *new_cr = CR_GOAL_LEN_SLOW; + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; } /* @@ -990,18 +1026,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ -static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group) +static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, + ext4_group_t group) { + int ret = 0; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL; int i, order, min_order; unsigned long num_stripe_clusters = 0; - - if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { - if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); - } + ext4_group_t start, end; /* * mb_avg_fragment_size_order() returns order in a way that makes @@ -1033,6 +1065,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); + start = group; + end = ext4_get_groups_count(ac->ac_sb); +wrap_around: for (i = order; i >= min_order; i--) { int frag_order; /* @@ -1055,17 +1090,24 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); - grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - return; - } + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, + start, end); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + } + if (start) { + end = start; + start = 0; + goto wrap_around; } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); + ac->ac_criteria = CR_GOAL_LEN_SLOW; + + return ret; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1080,59 +1122,82 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) } /* - * Return next linear group for allocation. + * next linear group for allocation. */ -static ext4_group_t -next_linear_group(ext4_group_t group, ext4_group_t ngroups) +static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ - return group + 1 >= ngroups ? 0 : group + 1; + *group = *group + 1 >= ngroups ? 0 : *group + 1; } -/* - * ext4_mb_choose_next_group: choose next group for allocation. - * - * @ac Allocation Context - * @new_cr This is an output parameter. If the there is no good group - * available at current CR level, this field is updated to indicate - * the new cr level that should be used. - * @group This is an input / output parameter. As an input it indicates the - * next group that the allocator intends to use for allocation. As - * output, this field indicates the next group that should be used as - * determined by the optimization functions. - * @ngroups Total number of groups - */ -static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, + ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) { - *new_cr = ac->ac_criteria; + int ret, i; + enum criteria cr = ac->ac_criteria; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group = *start; - if (!should_optimize_scan(ac)) { - *group = next_linear_group(*group, ngroups); - return; + for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { + ret = ext4_mb_scan_group(ac, group); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; + cond_resched(); } + *start = group; + if (count == ngroups) + ac->ac_criteria++; + + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + return 0; +} + +static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) +{ + int ret = 0; + ext4_group_t start; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + /* searching for the right group start from the goal value specified */ + start = ac->ac_g_ex.fe_group; + ac->ac_prefetch_grp = start; + ac->ac_prefetch_nr = 0; + + if (!should_optimize_scan(ac)) + return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); + /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ - if (ac->ac_groups_linear_remaining) { - *group = next_linear_group(*group, ngroups); - ac->ac_groups_linear_remaining--; - return; - } + if (sbi->s_mb_max_linear_groups) + ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, + sbi->s_mb_max_linear_groups); + if (ret || ac->ac_status != AC_STATUS_CONTINUE) + return ret; - if (*new_cr == CR_POWER2_ALIGNED) { - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); - } else if (*new_cr == CR_GOAL_LEN_FAST) { - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); - } else if (*new_cr == CR_BEST_AVAIL_LEN) { - ext4_mb_choose_next_group_best_avail(ac, new_cr, group); - } else { + switch (ac->ac_criteria) { + case CR_POWER2_ALIGNED: + return ext4_mb_scan_groups_p2_aligned(ac, start); + case CR_GOAL_LEN_FAST: + return ext4_mb_scan_groups_goal_fast(ac, start); + case CR_BEST_AVAIL_LEN: + return ext4_mb_scan_groups_best_avail(ac, start); + default: /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should @@ -1140,6 +1205,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, */ WARN_ON(1); } + + return 0; } /* @@ -1150,33 +1217,35 @@ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; + int new, old = grp->bb_largest_free_order; - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) - if (grp->bb_counters[i] > 0) + for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) + if (grp->bb_counters[new] > 0) break; + /* No need to move between order lists? */ - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || - i == grp->bb_largest_free_order) { - grp->bb_largest_free_order = i; + if (new == old) return; - } - if (grp->bb_largest_free_order >= 0) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); - list_del_init(&grp->bb_largest_free_order_node); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + if (old >= 0) { + struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; + + if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) + xa_erase(xa, grp->bb_group); } - grp->bb_largest_free_order = i; - if (grp->bb_largest_free_order >= 0 && grp->bb_free) { - write_lock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); - list_add_tail(&grp->bb_largest_free_order_node, - &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); - write_unlock(&sbi->s_mb_largest_free_orders_locks[ - grp->bb_largest_free_order]); + + grp->bb_largest_free_order = new; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { + /* + * Cannot use __GFP_NOFAIL because we hold the group lock. + * Although allocation for insertion may fails, it's not fatal + * as we have linear traversal to fall back on. + */ + int err = xa_insert(&sbi->s_mb_largest_free_orders[new], + grp->bb_group, grp, GFP_ATOMIC); + if (err) + mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", + grp->bb_group, new, err); } } @@ -2167,11 +2236,11 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - spin_lock(&sbi->s_md_lock); - sbi->s_mb_last_group = ac->ac_f_ex.fe_group; - sbi->s_mb_last_start = ac->ac_f_ex.fe_start; - spin_unlock(&sbi->s_md_lock); + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } + /* * As we've just preallocated more space than * user requested originally, we store allocated @@ -2571,6 +2640,30 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) +{ + bool is_stripe_aligned; + struct ext4_sb_info *sbi; + enum criteria cr = ac->ac_criteria; + + ac->ac_groups_scanned++; + if (cr == CR_POWER2_ALIGNED) + return ext4_mb_simple_scan_group(ac, ac->ac_e4b); + + sbi = EXT4_SB(ac->ac_sb); + is_stripe_aligned = false; + if ((sbi->s_stripe >= sbi->s_cluster_ratio) && + !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) + is_stripe_aligned = true; + + if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && + is_stripe_aligned) + ext4_mb_scan_aligned(ac, ac->ac_e4b); + + if (ac->ac_status == AC_STATUS_CONTINUE) + ext4_mb_complex_scan_group(ac, ac->ac_e4b); +} + /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable @@ -2761,6 +2854,37 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, } /* + * Batch reads of the block allocation bitmaps to get + * multiple READs in flight; limit prefetching at inexpensive + * CR, otherwise mballoc can spend a lot of time loading + * imperfect groups + */ +static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + struct ext4_sb_info *sbi; + + if (ac->ac_prefetch_grp != group) + return; + + sbi = EXT4_SB(ac->ac_sb); + if (ext4_mb_cr_expensive(ac->ac_criteria) || + ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { + unsigned int nr = sbi->s_mb_prefetch; + + if (ext4_has_feature_flex_bg(ac->ac_sb)) { + nr = 1 << sbi->s_log_groups_per_flex; + nr -= group & (nr - 1); + nr = umin(nr, sbi->s_mb_prefetch); + } + + ac->ac_prefetch_nr = nr; + ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, + &ac->ac_prefetch_ios); + } +} + +/* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O @@ -2793,24 +2917,58 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, } } +static int ext4_mb_scan_group(struct ext4_allocation_context *ac, + ext4_group_t group) +{ + int ret; + struct super_block *sb = ac->ac_sb; + enum criteria cr = ac->ac_criteria; + + ext4_mb_might_prefetch(ac, group); + + /* prevent unnecessary buddy loading. */ + if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) + return 0; + + /* This now checks without needing the buddy page */ + ret = ext4_mb_good_group_nolock(ac, group, cr); + if (ret <= 0) { + if (!ac->ac_first_err) + ac->ac_first_err = ret; + return 0; + } + + ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); + if (ret) + return ret; + + /* skip busy group */ + if (cr >= CR_ANY_FREE) + ext4_lock_group(sb, group); + else if (!ext4_try_lock_group(sb, group)) + goto out_unload; + + /* We need to check again after locking the block group. */ + if (unlikely(!ext4_mb_good_group(ac, group, cr))) + goto out_unlock; + + __ext4_mb_scan_group(ac); + +out_unlock: + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(ac->ac_e4b); + return ret; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t prefetch_grp = 0, ngroups, group, i; - enum criteria new_cr, cr = CR_GOAL_LEN_FAST; - int err = 0, first_err = 0; - unsigned int nr = 0, prefetch_ios = 0; - struct ext4_sb_info *sbi; - struct super_block *sb; + ext4_group_t i; + int err = 0; + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int lost; - - sb = ac->ac_sb; - sbi = EXT4_SB(sb); - ngroups = ext4_get_groups_count(sb); - /* non-extent files are limited to low blocks/groups */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) - ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); @@ -2844,11 +3002,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - /* TBD: may be hot point */ - spin_lock(&sbi->s_md_lock); - ac->ac_g_ex.fe_group = sbi->s_mb_last_group; - ac->ac_g_ex.fe_start = sbi->s_mb_last_start; - spin_unlock(&sbi->s_md_lock); + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); + ac->ac_g_ex.fe_start = -1; + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; } /* @@ -2856,107 +3014,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ + ac->ac_criteria = CR_GOAL_LEN_FAST; if (ac->ac_2order) - cr = CR_POWER2_ALIGNED; -repeat: - for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { - ac->ac_criteria = cr; - /* - * searching for the right group start - * from the goal value specified - */ - group = ac->ac_g_ex.fe_group; - ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; - prefetch_grp = group; - nr = 0; - - for (i = 0, new_cr = cr; i < ngroups; i++, - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { - int ret = 0; - - cond_resched(); - if (new_cr != cr) { - cr = new_cr; - goto repeat; - } - - /* - * Batch reads of the block allocation bitmaps - * to get multiple READs in flight; limit - * prefetching at inexpensive CR, otherwise mballoc - * can spend a lot of time loading imperfect groups - */ - if ((prefetch_grp == group) && - (ext4_mb_cr_expensive(cr) || - prefetch_ios < sbi->s_mb_prefetch_limit)) { - nr = sbi->s_mb_prefetch; - if (ext4_has_feature_flex_bg(sb)) { - nr = 1 << sbi->s_log_groups_per_flex; - nr -= group & (nr - 1); - nr = min(nr, sbi->s_mb_prefetch); - } - prefetch_grp = ext4_mb_prefetch(sb, group, - nr, &prefetch_ios); - } - - /* This now checks without needing the buddy page */ - ret = ext4_mb_good_group_nolock(ac, group, cr); - if (ret <= 0) { - if (!first_err) - first_err = ret; - continue; - } - - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) - goto out; - - ext4_lock_group(sb, group); - - /* - * We need to check again after locking the - * block group - */ - ret = ext4_mb_good_group(ac, group, cr); - if (ret == 0) { - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - continue; - } + ac->ac_criteria = CR_POWER2_ALIGNED; - ac->ac_groups_scanned++; - if (cr == CR_POWER2_ALIGNED) - ext4_mb_simple_scan_group(ac, &e4b); - else { - bool is_stripe_aligned = - (sbi->s_stripe >= - sbi->s_cluster_ratio) && - !(ac->ac_g_ex.fe_len % - EXT4_NUM_B2C(sbi, sbi->s_stripe)); - - if ((cr == CR_GOAL_LEN_FAST || - cr == CR_BEST_AVAIL_LEN) && - is_stripe_aligned) - ext4_mb_scan_aligned(ac, &e4b); - - if (ac->ac_status == AC_STATUS_CONTINUE) - ext4_mb_complex_scan_group(ac, &e4b); - } - - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - - if (ac->ac_status != AC_STATUS_CONTINUE) - break; - } - /* Processed all groups and haven't found blocks */ - if (sbi->s_mb_stats && i == ngroups) - atomic64_inc(&sbi->s_bal_cX_failed[cr]); + ac->ac_e4b = &e4b; + ac->ac_prefetch_ios = 0; + ac->ac_first_err = 0; +repeat: + while (ac->ac_criteria < EXT4_MB_NUM_CRS) { + err = ext4_mb_scan_groups(ac); + if (err) + goto out; - if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) - /* Reset goal length to original goal length before - * falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + if (ac->ac_status != AC_STATUS_CONTINUE) + break; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2967,6 +3039,8 @@ repeat: */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { + int lost; + /* * Someone more lucky has already allocated it. * The only thing we can do is just take first @@ -2982,23 +3056,27 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = CR_ANY_FREE; + ac->ac_criteria = CR_ANY_FREE; goto repeat; } } - if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) + atomic_inc(&sbi->s_bal_stream_goals); + } out: - if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) - err = first_err; + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) + err = ac->ac_first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, - ac->ac_flags, cr, err); + ac->ac_flags, ac->ac_criteria, err); - if (nr) - ext4_mb_prefetch_fini(sb, prefetch_grp, nr); + if (ac->ac_prefetch_nr) + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); return err; } @@ -3121,8 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); @@ -3135,8 +3211,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); @@ -3150,8 +3224,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); - seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); @@ -3181,6 +3253,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tstream_goal_hits: %u\n", + atomic_read(&sbi->s_bal_stream_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); @@ -3227,6 +3301,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; + unsigned long idx; position--; if (position >= MB_NUM_ORDERS(sb)) { @@ -3235,11 +3310,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; - read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); - list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], - bb_avg_fragment_size_node) + xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp) count++; - read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; @@ -3251,11 +3323,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) seq_puts(seq, "max_free_order_lists:\n"); } count = 0; - read_lock(&sbi->s_mb_largest_free_orders_locks[position]); - list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], - bb_largest_free_order_node) + xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) count++; - read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); @@ -3375,8 +3444,6 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; - INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); - INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; @@ -3586,6 +3653,20 @@ static void ext4_discard_work(struct work_struct *work) ext4_mb_unload_buddy(&e4b); } +static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) +{ + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + kfree(sbi->s_mb_avg_fragment_size); +} + +static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) +{ + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) + xa_destroy(&sbi->s_mb_largest_free_orders[i]); + kfree(sbi->s_mb_largest_free_orders); +} + int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3631,44 +3712,27 @@ int ext4_mb_init(struct super_block *sb) } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } - sbi->s_mb_avg_fragment_size_locks = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), - GFP_KERNEL); - if (!sbi->s_mb_avg_fragment_size_locks) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { - INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); - rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); - } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_avg_fragment_size[i]); + sbi->s_mb_largest_free_orders = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } - sbi->s_mb_largest_free_orders_locks = - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), - GFP_KERNEL); - if (!sbi->s_mb_largest_free_orders_locks) { - ret = -ENOMEM; - goto out; - } - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { - INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); - rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); - } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) + xa_init(&sbi->s_mb_largest_free_orders[i]); spin_lock_init(&sbi->s_md_lock); - sbi->s_mb_free_pending = 0; + atomic_set(&sbi->s_mb_free_pending, 0); INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); @@ -3709,10 +3773,19 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } + sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), + DIV_ROUND_UP(sbi->s_groups_count, 4)); + sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, + sizeof(ext4_group_t), GFP_KERNEL); + if (sbi->s_mb_last_groups == NULL) { + ret = -ENOMEM; + goto out; + } + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out; + goto out_free_last_groups; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -3737,11 +3810,12 @@ int ext4_mb_init(struct super_block *sb) out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; +out_free_last_groups: + kfree(sbi->s_mb_last_groups); + sbi->s_mb_last_groups = NULL; out: - kfree(sbi->s_mb_avg_fragment_size); - kfree(sbi->s_mb_avg_fragment_size_locks); - kfree(sbi->s_mb_largest_free_orders); - kfree(sbi->s_mb_largest_free_orders_locks); + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -3808,10 +3882,8 @@ void ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } - kfree(sbi->s_mb_avg_fragment_size); - kfree(sbi->s_mb_avg_fragment_size_locks); - kfree(sbi->s_mb_largest_free_orders); - kfree(sbi->s_mb_largest_free_orders_locks); + ext4_mb_avg_fragment_size_destroy(sbi); + ext4_mb_largest_free_orders_destroy(sbi); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); @@ -3841,6 +3913,7 @@ void ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); + kfree(sbi->s_mb_last_groups); } static inline int ext4_issue_discard(struct super_block *sb, @@ -3871,10 +3944,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); - spin_lock(&EXT4_SB(sb)->s_md_lock); - EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; - spin_unlock(&EXT4_SB(sb)->s_md_lock); - + atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; @@ -6278,28 +6348,63 @@ out: * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ -static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, - struct ext4_free_data *entry, - struct ext4_free_data *new_entry, - struct rb_root *entry_rb_root) +static inline bool +ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) { - if ((entry->efd_tid != new_entry->efd_tid) || - (entry->efd_group != new_entry->efd_group)) - return; - if (entry->efd_start_cluster + entry->efd_count == - new_entry->efd_start_cluster) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - } else if (new_entry->efd_start_cluster + new_entry->efd_count == - entry->efd_start_cluster) { - new_entry->efd_count += entry->efd_count; - } else - return; + if (entry1->efd_tid != entry2->efd_tid) + return false; + if (entry1->efd_start_cluster + entry1->efd_count != + entry2->efd_start_cluster) + return false; + if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) + return false; + return true; +} + +static inline void +ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + entry1->efd_count += entry2->efd_count; spin_lock(&sbi->s_md_lock); - list_del(&entry->efd_list); + list_del(&entry2->efd_list); spin_unlock(&sbi->s_md_lock); - rb_erase(&entry->efd_node, entry_rb_root); - kmem_cache_free(ext4_free_data_cachep, entry); + rb_erase(&entry2->efd_node, root); + kmem_cache_free(ext4_free_data_cachep, entry2); +} + +static inline void +ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry) +{ + struct ext4_free_data *prev; + struct rb_node *node; + + node = rb_prev(&entry->efd_node); + if (!node) + return; + + prev = rb_entry(node, struct ext4_free_data, efd_node); + if (ext4_freed_extents_can_be_merged(prev, entry)) + ext4_merge_freed_extents(sbi, root, prev, entry); +} + +static inline void +ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root, + struct ext4_free_data *entry) +{ + struct ext4_free_data *next; + struct rb_node *node; + + node = rb_next(&entry->efd_node); + if (!node) + return; + + next = rb_entry(node, struct ext4_free_data, efd_node); + if (ext4_freed_extents_can_be_merged(entry, next)) + ext4_merge_freed_extents(sbi, root, entry, next); } static noinline_for_stack void @@ -6309,11 +6414,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; - struct ext4_free_data *entry; + struct ext4_free_data *entry = NULL; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_root *root = &db->bb_free_root; + struct rb_node **n = &root->rb_node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); @@ -6349,27 +6455,30 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } } - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &db->bb_free_root); + atomic_add(clusters, &sbi->s_mb_free_pending); + if (!entry) + goto insert; - /* Now try to see the extent can be merged to left and right */ - node = rb_prev(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - ext4_try_merge_freed_extent(sbi, entry, new_entry, - &(db->bb_free_root)); + /* Now try to see the extent can be merged to prev and next */ + if (ext4_freed_extents_can_be_merged(new_entry, entry)) { + entry->efd_start_cluster = cluster; + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_prev(sbi, root, entry); + return; } - - node = rb_next(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - ext4_try_merge_freed_extent(sbi, entry, new_entry, - &(db->bb_free_root)); + if (ext4_freed_extents_can_be_merged(entry, new_entry)) { + entry->efd_count += new_entry->efd_count; + kmem_cache_free(ext4_free_data_cachep, new_entry); + ext4_try_merge_freed_extent_next(sbi, root, entry); + return; } +insert: + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, root); spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); - sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index f8280de3e882..15a049f05d04 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -192,8 +192,13 @@ struct ext4_allocation_context { */ ext4_grpblk_t ac_orig_goal_len; + ext4_group_t ac_prefetch_grp; + unsigned int ac_prefetch_ios; + unsigned int ac_prefetch_nr; + + int ac_first_err; + __u32 ac_flags; /* allocation hints */ - __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; @@ -204,6 +209,8 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ + + struct ext4_buddy *ac_e4b; struct folio *ac_bitmap_folio; struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 1f8493a56e8f..adae3caf175a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -280,7 +280,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, */ again: *err = 0; - jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, + block_len_in_page) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a178ac229489..d83f91b62317 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2915,33 +2915,50 @@ err_unlock_inode: return err; } -struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, - struct ext4_dir_entry_2 *de, - int blocksize, int csum_size, - unsigned int parent_ino, int dotdot_real_len) +int ext4_init_dirblock(handle_t *handle, struct inode *inode, + struct buffer_head *bh, unsigned int parent_ino, + void *inline_buf, int inline_size) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data; + size_t blocksize = bh->b_size; + int csum_size = 0, header_size; + + if (ext4_has_feature_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); - strcpy(de->name, "."); + memcpy(de->name, ".", 2); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; - if (!dotdot_real_len) - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + ext4_dir_rec_len(1, NULL)), - blocksize); - else + memcpy(de->name, "..", 3); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + if (inline_buf) { de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); - strcpy(de->name, ".."); - ext4_set_de_type(inode->i_sb, de, S_IFDIR); + de = ext4_next_entry(de, blocksize); + header_size = (char *)de - bh->b_data; + memcpy((void *)de, inline_buf, inline_size); + ext4_update_final_de(bh->b_data, inline_size + header_size, + blocksize - csum_size); + } else { + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + ext4_dir_rec_len(1, NULL)), + blocksize); + } - return ext4_next_entry(de, blocksize); + if (csum_size) + ext4_initialize_dirent_tail(bh, blocksize); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + return ext4_handle_dirty_dirblock(handle, inode, bh); } int ext4_init_new_dir(handle_t *handle, struct inode *dir, @@ -2950,13 +2967,8 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; - int csum_size = 0; int err; - if (ext4_has_feature_metadata_csum(dir->i_sb)) - csum_size = sizeof(struct ext4_dir_entry_tail); - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) @@ -2965,21 +2977,15 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, goto out; } + set_nlink(inode, 2); inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); de = (struct ext4_dir_entry_2 *)dir_block->b_data; - ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); - set_nlink(inode, 2); - if (csum_size) - ext4_initialize_dirent_tail(dir_block, blocksize); - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); + err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); if (err) goto out; - set_buffer_verified(dir_block); out: brelse(dir_block); return err; @@ -3082,7 +3088,8 @@ bool ext4_empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || - le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 || + de->name[0] != '.') { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; @@ -3091,7 +3098,8 @@ bool ext4_empty_dir(struct inode *inode) de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' || de->name[1] != '.') { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; @@ -3532,7 +3540,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || - strcmp(".", de->name)) { + de->name_len != 1 || de->name[0] != '.') { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; @@ -3543,7 +3551,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || + de->name[0] != '.' || de->name[1] != '.') { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 179e54f3a3b6..3d8b0f6d2dea 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -236,10 +236,12 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) + if (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec)) return true; if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && - io_end->flag & EXT4_IO_END_FAILED) + io_end->flag & EXT4_IO_END_FAILED && + !ext4_emergency_state(io_end->inode->i_sb)) return true; return false; } @@ -256,6 +258,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && !io_end->handle && sbi->s_journal); + WARN_ON(!io_end->bio); spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; @@ -318,12 +321,9 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_FAILED || - (io_end->flag & EXT4_IO_END_UNWRITTEN && - !list_empty(&io_end->list_vec))) { - ext4_add_complete_io(io_end); - return; - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a7f80ca01174..c7d39da7e733 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3627,7 +3627,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) */ static void print_daily_error_info(struct timer_list *t) { - struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); + struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8d15acbacc20..5a6fe1513fd2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -338,7 +338,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); - if (cmp <= 0 && (sorted || cmp == 0)) + if (!cmp || (cmp < 0 && sorted)) break; } *pentry = entry; @@ -962,7 +962,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) - credits += ext4_writepage_trans_blocks(inode) + 1; + credits += ext4_chunk_trans_extent(inode, 1) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31e892842625..711ad80b38d0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3519,8 +3519,10 @@ reserve_block: return 0; } -static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int f2fs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3656,7 +3658,7 @@ fail: return err; } -static int f2fs_write_end(struct file *file, +static int f2fs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9333a22b9a01..c78464792ceb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3615,9 +3615,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, bool readonly, bool need_lock); int f2fs_precache_extents(struct inode *inode); -int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int f2fs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6bd3de64f2a8..c677230699fd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,6 +35,17 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> +static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +{ + loff_t old_size = i_size_read(inode); + + if (old_size >= new_size) + return; + + /* zero or drop pages only in range of [old_size, new_size] */ + truncate_pagecache(inode, old_size); +} + static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); + filemap_invalidate_unlock(inode->i_mapping); + file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); + folio_lock(folio); if (unlikely(folio->mapping != inode->i_mapping || folio_pos(folio) > i_size_read(inode) || @@ -532,8 +548,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } -static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int f2fs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) @@ -543,7 +560,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return -EOPNOTSUPP; file_accessed(file); - vma->vm_ops = &f2fs_file_vm_ops; + desc->vm_ops = &f2fs_file_vm_ops; f2fs_down_read(&F2FS_I(inode)->i_sem); set_inode_flag(inode, FI_MMAP_FILE); @@ -1109,6 +1126,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_down_write(&fi->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + if (attr->ia_size > old_size) + f2fs_zero_post_eof_page(inode, attr->ia_size); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1227,6 +1246,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1510,6 +1533,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); @@ -1631,6 +1656,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + filemap_invalidate_lock(mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1762,6 +1791,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); + + f2fs_zero_post_eof_page(inode, offset + len); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1819,6 +1850,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; @@ -3356,7 +3391,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) } #endif -int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -3380,7 +3415,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int f2fs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL; @@ -4860,6 +4895,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) err = file_modified(file); if (err) return err; + + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); + filemap_invalidate_unlock(inode->i_mapping); return count; } @@ -5376,7 +5415,7 @@ const struct file_operations f2fs_file_operations = { .iopoll = iocb_bio_iopoll, .open = f2fs_file_open, .release = f2fs_release_file, - .mmap = f2fs_file_mmap, + .mmap_prepare = f2fs_file_mmap_prepare, .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1cb4cba7f961..bfe104db284e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2078,7 +2078,6 @@ write_node: if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { - folio_unlock(folio); folio_batch_release(&fbatch); ret = -EIO; goto out; diff --git a/fs/fat/file.c b/fs/fat/file.c index e887e9ab7472..4fc49a614fb8 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -204,7 +204,7 @@ const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, .compat_ioctl = compat_ptr_ioctl, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3852bb66358c..9648ed097816 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -219,13 +219,14 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } } -static int fat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int fat_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int err; - err = cont_write_begin(file, mapping, pos, len, + err = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) @@ -233,13 +234,14 @@ static int fat_write_begin(struct file *file, struct address_space *mapping, return err; } -static int fat_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int fat_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 23e9b9371ec3..0b920ee40a7f 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -646,7 +646,7 @@ static const struct inode_operations msdos_dir_inode_operations = { static void setup(struct super_block *sb) { MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; - sb->s_d_op = &msdos_dentry_operations; + set_default_d_op(sb, &msdos_dentry_operations); sb->s_flags |= SB_NOATIME; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index dd910edd2404..5dbc4cbb8fce 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1187,9 +1187,9 @@ static void setup(struct super_block *sb) { MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations; if (MSDOS_SB(sb)->options.name_check != 's') - sb->s_d_op = &vfat_ci_dentry_ops; + set_default_d_op(sb, &vfat_ci_dentry_ops); else - sb->s_d_op = &vfat_dentry_ops; + set_default_d_op(sb, &vfat_dentry_ops); } static int vfat_fill_super(struct super_block *sb, struct fs_context *fc) diff --git a/fs/fhandle.c b/fs/fhandle.c index 3e092ae6d142..7c236f64cdea 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path, if (fh_flags & EXPORT_FH_CONNECTABLE) { handle->handle_type |= FILEID_IS_CONNECTABLE; if (d_is_dir(path->dentry)) - fh_flags |= FILEID_IS_DIR; + handle->handle_type |= FILEID_IS_DIR; } retval = 0; } @@ -168,23 +168,28 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, return err; } -static int get_path_from_fd(int fd, struct path *root) +static int get_path_anchor(int fd, struct path *root) { - if (fd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - spin_lock(&fs->lock); - *root = fs->pwd; - path_get(root); - spin_unlock(&fs->lock); - } else { + if (fd >= 0) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); + return 0; } - return 0; + if (fd == AT_FDCWD) { + get_fs_pwd(current->fs, root); + return 0; + } + + if (fd == FD_PIDFS_ROOT) { + pidfs_get_root(root); + return 0; + } + + return -EBADF; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) @@ -323,13 +328,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, { int retval = 0; struct file_handle f_handle; - struct file_handle *handle = NULL; + struct file_handle *handle __free(kfree) = NULL; struct handle_to_path_ctx ctx = {}; const struct export_operations *eops; - retval = get_path_from_fd(mountdirfd, &ctx.root); + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) + return -EINVAL; + + if (f_handle.handle_type < 0 || + FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) + return -EINVAL; + + retval = get_path_anchor(mountdirfd, &ctx.root); if (retval) - goto out_err; + return retval; eops = ctx.root.mnt->mnt_sb->s_export_op; if (eops && eops->permission) @@ -339,21 +355,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, if (retval) goto out_path; - if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { - retval = -EFAULT; - goto out_path; - } - if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || - (f_handle.handle_bytes == 0)) { - retval = -EINVAL; - goto out_path; - } - if (f_handle.handle_type < 0 || - FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) { - retval = -EINVAL; - goto out_path; - } - handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { @@ -366,7 +367,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; - goto out_handle; + goto out_path; } /* @@ -384,11 +385,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, handle->handle_type &= ~FILEID_USER_FLAGS_MASK; retval = do_handle_to_path(handle, path, &ctx); -out_handle: - kfree(handle); out_path: path_put(&ctx.root); -out_err: return retval; } diff --git a/fs/file.c b/fs/file.c index 3a3146664cf3..6d2275c3be9c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -197,6 +197,21 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) return ERR_PTR(-EMFILE); } + /* + * Check if the allocation size would exceed INT_MAX. kvmalloc_array() + * and kvmalloc() will warn if the allocation size is greater than + * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. + * + * This can happen when sysctl_nr_open is set to a very high value and + * a process tries to use a file descriptor near that limit. For example, + * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what + * systemd typically sets it to - then trying to use a file descriptor + * close to that value will require allocating a file descriptor table + * that exceeds 8GB in size. + */ + if (unlikely(nr > INT_MAX / sizeof(struct file *))) + return ERR_PTR(-EMFILE); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; @@ -1198,8 +1213,12 @@ bool file_seek_cur_needs_f_lock(struct file *file) if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; - VFS_WARN_ON_ONCE((file_count(file) > 1) && - !mutex_is_locked(&file->f_pos_lock)); + /* + * Note that we are not guaranteed to be called after fdget_pos() on + * this file obj, in which case the caller is expected to provide the + * appropriate locking. + */ + return true; } diff --git a/fs/file_attr.c b/fs/file_attr.c new file mode 100644 index 000000000000..12424d4945d0 --- /dev/null +++ b/fs/file_attr.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/security.h> +#include <linux/fscrypt.h> +#include <linux/fileattr.h> +#include <linux/export.h> +#include <linux/syscalls.h> +#include <linux/namei.h> + +#include "internal.h" + +/** + * fileattr_fill_xflags - initialize fileattr with xflags + * @fa: fileattr pointer + * @xflags: FS_XFLAG_* flags + * + * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All + * other fields are zeroed. + */ +void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags) +{ + memset(fa, 0, sizeof(*fa)); + fa->fsx_valid = true; + fa->fsx_xflags = xflags; + if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE) + fa->flags |= FS_IMMUTABLE_FL; + if (fa->fsx_xflags & FS_XFLAG_APPEND) + fa->flags |= FS_APPEND_FL; + if (fa->fsx_xflags & FS_XFLAG_SYNC) + fa->flags |= FS_SYNC_FL; + if (fa->fsx_xflags & FS_XFLAG_NOATIME) + fa->flags |= FS_NOATIME_FL; + if (fa->fsx_xflags & FS_XFLAG_NODUMP) + fa->flags |= FS_NODUMP_FL; + if (fa->fsx_xflags & FS_XFLAG_DAX) + fa->flags |= FS_DAX_FL; + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + fa->flags |= FS_PROJINHERIT_FL; +} +EXPORT_SYMBOL(fileattr_fill_xflags); + +/** + * fileattr_fill_flags - initialize fileattr with flags + * @fa: fileattr pointer + * @flags: FS_*_FL flags + * + * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). + * All other fields are zeroed. + */ +void fileattr_fill_flags(struct file_kattr *fa, u32 flags) +{ + memset(fa, 0, sizeof(*fa)); + fa->flags_valid = true; + fa->flags = flags; + if (fa->flags & FS_SYNC_FL) + fa->fsx_xflags |= FS_XFLAG_SYNC; + if (fa->flags & FS_IMMUTABLE_FL) + fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; + if (fa->flags & FS_APPEND_FL) + fa->fsx_xflags |= FS_XFLAG_APPEND; + if (fa->flags & FS_NODUMP_FL) + fa->fsx_xflags |= FS_XFLAG_NODUMP; + if (fa->flags & FS_NOATIME_FL) + fa->fsx_xflags |= FS_XFLAG_NOATIME; + if (fa->flags & FS_DAX_FL) + fa->fsx_xflags |= FS_XFLAG_DAX; + if (fa->flags & FS_PROJINHERIT_FL) + fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; +} +EXPORT_SYMBOL(fileattr_fill_flags); + +/** + * vfs_fileattr_get - retrieve miscellaneous file attributes + * @dentry: the object to retrieve from + * @fa: fileattr pointer + * + * Call i_op->fileattr_get() callback, if exists. + * + * Return: 0 on success, or a negative error on failure. + */ +int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + int error; + + if (!inode->i_op->fileattr_get) + return -EOPNOTSUPP; + + error = security_inode_file_getattr(dentry, fa); + if (error) + return error; + + return inode->i_op->fileattr_get(dentry, fa); +} +EXPORT_SYMBOL(vfs_fileattr_get); + +static void fileattr_to_file_attr(const struct file_kattr *fa, + struct file_attr *fattr) +{ + __u32 mask = FS_XFLAGS_MASK; + + memset(fattr, 0, sizeof(struct file_attr)); + fattr->fa_xflags = fa->fsx_xflags & mask; + fattr->fa_extsize = fa->fsx_extsize; + fattr->fa_nextents = fa->fsx_nextents; + fattr->fa_projid = fa->fsx_projid; + fattr->fa_cowextsize = fa->fsx_cowextsize; +} + +/** + * copy_fsxattr_to_user - copy fsxattr to userspace. + * @fa: fileattr pointer + * @ufa: fsxattr user pointer + * + * Return: 0 on success, or -EFAULT on failure. + */ +int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa) +{ + struct fsxattr xfa; + __u32 mask = FS_XFLAGS_MASK; + + memset(&xfa, 0, sizeof(xfa)); + xfa.fsx_xflags = fa->fsx_xflags & mask; + xfa.fsx_extsize = fa->fsx_extsize; + xfa.fsx_nextents = fa->fsx_nextents; + xfa.fsx_projid = fa->fsx_projid; + xfa.fsx_cowextsize = fa->fsx_cowextsize; + + if (copy_to_user(ufa, &xfa, sizeof(xfa))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(copy_fsxattr_to_user); + +static int file_attr_to_fileattr(const struct file_attr *fattr, + struct file_kattr *fa) +{ + __u64 mask = FS_XFLAGS_MASK; + + if (fattr->fa_xflags & ~mask) + return -EINVAL; + + fileattr_fill_xflags(fa, fattr->fa_xflags); + fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; + fa->fsx_extsize = fattr->fa_extsize; + fa->fsx_projid = fattr->fa_projid; + fa->fsx_cowextsize = fattr->fa_cowextsize; + + return 0; +} + +static int copy_fsxattr_from_user(struct file_kattr *fa, + struct fsxattr __user *ufa) +{ + struct fsxattr xfa; + __u32 mask = FS_XFLAGS_MASK; + + if (copy_from_user(&xfa, ufa, sizeof(xfa))) + return -EFAULT; + + if (xfa.fsx_xflags & ~mask) + return -EOPNOTSUPP; + + fileattr_fill_xflags(fa, xfa.fsx_xflags); + fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; + fa->fsx_extsize = xfa.fsx_extsize; + fa->fsx_nextents = xfa.fsx_nextents; + fa->fsx_projid = xfa.fsx_projid; + fa->fsx_cowextsize = xfa.fsx_cowextsize; + + return 0; +} + +/* + * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject + * any invalid configurations. + * + * Note: must be called with inode lock held. + */ +static int fileattr_set_prepare(struct inode *inode, + const struct file_kattr *old_ma, + struct file_kattr *fa) +{ + int err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags); + if (err) + return err; + + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() != &init_user_ns) { + if (old_ma->fsx_projid != fa->fsx_projid) + return -EINVAL; + if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & + FS_XFLAG_PROJINHERIT) + return -EINVAL; + } else { + /* + * Caller is allowed to change the project ID. If it is being + * changed, make sure that the new value is valid. + */ + if (old_ma->fsx_projid != fa->fsx_projid && + !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) + return -EINVAL; + } + + /* Check extent size hints. */ + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && + !S_ISDIR(inode->i_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && + !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -EINVAL; + + /* + * It is only valid to set the DAX flag on regular files and + * directories on filesystems. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + + /* Extent size hints of zero turn off the flags. */ + if (fa->fsx_extsize == 0) + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); + if (fa->fsx_cowextsize == 0) + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} + +/** + * vfs_fileattr_set - change miscellaneous file attributes + * @idmap: idmap of the mount + * @dentry: the object to change + * @fa: fileattr pointer + * + * After verifying permissions, call i_op->fileattr_set() callback, if + * exists. + * + * Verifying attributes involves retrieving current attributes with + * i_op->fileattr_get(), this also allows initializing attributes that have + * not been set by the caller to current values. Inode lock is held + * thoughout to prevent racing with another instance. + * + * Return: 0 on success, or a negative error on failure. + */ +int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, + struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct file_kattr old_ma = {}; + int err; + + if (!inode->i_op->fileattr_set) + return -EOPNOTSUPP; + + if (!inode_owner_or_capable(idmap, inode)) + return -EPERM; + + inode_lock(inode); + err = vfs_fileattr_get(dentry, &old_ma); + if (!err) { + /* initialize missing bits from old_ma */ + if (fa->flags_valid) { + fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON; + fa->fsx_extsize = old_ma.fsx_extsize; + fa->fsx_nextents = old_ma.fsx_nextents; + fa->fsx_projid = old_ma.fsx_projid; + fa->fsx_cowextsize = old_ma.fsx_cowextsize; + } else { + fa->flags |= old_ma.flags & ~FS_COMMON_FL; + } + + err = fileattr_set_prepare(inode, &old_ma, fa); + if (err) + goto out; + err = security_inode_file_setattr(dentry, fa); + if (err) + goto out; + err = inode->i_op->fileattr_set(idmap, dentry, fa); + if (err) + goto out; + } + +out: + inode_unlock(inode); + return err; +} +EXPORT_SYMBOL(vfs_fileattr_set); + +int ioctl_getflags(struct file *file, unsigned int __user *argp) +{ + struct file_kattr fa = { .flags_valid = true }; /* hint only */ + int err; + + err = vfs_fileattr_get(file->f_path.dentry, &fa); + if (err == -EOPNOTSUPP) + err = -ENOIOCTLCMD; + if (!err) + err = put_user(fa.flags, argp); + return err; +} +EXPORT_SYMBOL(ioctl_getflags); + +int ioctl_setflags(struct file *file, unsigned int __user *argp) +{ + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct dentry *dentry = file->f_path.dentry; + struct file_kattr fa; + unsigned int flags; + int err; + + err = get_user(flags, argp); + if (!err) { + err = mnt_want_write_file(file); + if (!err) { + fileattr_fill_flags(&fa, flags); + err = vfs_fileattr_set(idmap, dentry, &fa); + mnt_drop_write_file(file); + if (err == -EOPNOTSUPP) + err = -ENOIOCTLCMD; + } + } + return err; +} +EXPORT_SYMBOL(ioctl_setflags); + +int ioctl_fsgetxattr(struct file *file, void __user *argp) +{ + struct file_kattr fa = { .fsx_valid = true }; /* hint only */ + int err; + + err = vfs_fileattr_get(file->f_path.dentry, &fa); + if (err == -EOPNOTSUPP) + err = -ENOIOCTLCMD; + if (!err) + err = copy_fsxattr_to_user(&fa, argp); + + return err; +} +EXPORT_SYMBOL(ioctl_fsgetxattr); + +int ioctl_fssetxattr(struct file *file, void __user *argp) +{ + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct dentry *dentry = file->f_path.dentry; + struct file_kattr fa; + int err; + + err = copy_fsxattr_from_user(&fa, argp); + if (!err) { + err = mnt_want_write_file(file); + if (!err) { + err = vfs_fileattr_set(idmap, dentry, &fa); + mnt_drop_write_file(file); + if (err == -EOPNOTSUPP) + err = -ENOIOCTLCMD; + } + } + return err; +} +EXPORT_SYMBOL(ioctl_fssetxattr); + +SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, + struct file_attr __user *, ufattr, size_t, usize, + unsigned int, at_flags) +{ + struct path filepath __free(path_put) = {}; + struct filename *name __free(putname) = NULL; + unsigned int lookup_flags = 0; + struct file_attr fattr; + struct file_kattr fa; + int error; + + BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + if (usize > PAGE_SIZE) + return -E2BIG; + + if (usize < FILE_ATTR_SIZE_VER0) + return -EINVAL; + + name = getname_maybe_null(filename, at_flags); + if (IS_ERR(name)) + return PTR_ERR(name); + + if (!name && dfd >= 0) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + + filepath = fd_file(f)->f_path; + path_get(&filepath); + } else { + error = filename_lookup(dfd, name, lookup_flags, &filepath, + NULL); + if (error) + return error; + } + + error = vfs_fileattr_get(filepath.dentry, &fa); + if (error) + return error; + + fileattr_to_file_attr(&fa, &fattr); + error = copy_struct_to_user(ufattr, usize, &fattr, + sizeof(struct file_attr), NULL); + + return error; +} + +SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, + struct file_attr __user *, ufattr, size_t, usize, + unsigned int, at_flags) +{ + struct path filepath __free(path_put) = {}; + struct filename *name __free(putname) = NULL; + unsigned int lookup_flags = 0; + struct file_attr fattr; + struct file_kattr fa; + int error; + + BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + if (usize > PAGE_SIZE) + return -E2BIG; + + if (usize < FILE_ATTR_SIZE_VER0) + return -EINVAL; + + error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr, + usize); + if (error) + return error; + + error = file_attr_to_fileattr(&fattr, &fa); + if (error) + return error; + + name = getname_maybe_null(filename, at_flags); + if (IS_ERR(name)) + return PTR_ERR(name); + + if (!name && dfd >= 0) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + + filepath = fd_file(f)->f_path; + path_get(&filepath); + } else { + error = filename_lookup(dfd, name, lookup_flags, &filepath, + NULL); + if (error) + return error; + } + + error = mnt_want_write(filepath.mnt); + if (!error) { + error = vfs_fileattr_set(mnt_idmap(filepath.mnt), + filepath.dentry, &fa); + mnt_drop_write(filepath.mnt); + } + + return error; +} diff --git a/fs/file_table.c b/fs/file_table.c index 138114d64307..81c72576e548 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,17 +52,20 @@ struct backing_file { }; }; -static inline struct backing_file *backing_file(struct file *f) -{ - return container_of(f, struct backing_file, file); -} +#define backing_file(f) container_of(f, struct backing_file, file) -struct path *backing_file_user_path(struct file *f) +struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } EXPORT_SYMBOL_GPL(backing_file_user_path); +void backing_file_set_user_path(struct file *f, const struct path *path) +{ + backing_file(f)->user_path = *path; +} +EXPORT_SYMBOL_GPL(backing_file_set_user_path); + static inline void file_free(struct file *f) { security_file_free(f); @@ -196,7 +199,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) file_ref_init(&f->f_ref, 1); /* * Disable permission and pre-content events for all files by default. - * They may be enabled later by file_set_fsnotify_mode_from_watchers(). + * They may be enabled later by fsnotify_open_perm_and_set_mode(). */ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 64c2d0814ed6..28be762ac1c6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path) struct path old_root; path_get(path); - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); old_root = fs->root; fs->root = *path; - write_seqcount_end(&fs->seq); - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); if (old_root.dentry) path_put(&old_root); } @@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path) struct path old_pwd; path_get(path); - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; - write_seqcount_end(&fs->seq); - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); if (old_pwd.dentry) path_put(&old_pwd); @@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) fs = p->fs; if (fs) { int hits = 0; - spin_lock(&fs->lock); - write_seqcount_begin(&fs->seq); + write_seqlock(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); - write_seqcount_end(&fs->seq); while (hits--) { count++; path_get(new_root); } - spin_unlock(&fs->lock); + write_sequnlock(&fs->seq); } task_unlock(p); } @@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk) if (fs) { int kill; task_lock(tsk); - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); tsk->fs = NULL; kill = !--fs->users; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); task_unlock(tsk); if (kill) free_fs_struct(fs); @@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) if (fs) { fs->users = 1; fs->in_exec = 0; - spin_lock_init(&fs->lock); - seqcount_spinlock_init(&fs->seq, &fs->lock); + seqlock_init(&fs->seq); fs->umask = old->umask; - spin_lock(&old->lock); + read_seqlock_excl(&old->seq); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); - spin_unlock(&old->lock); + read_sequnlock_excl(&old->seq); } return fs; } @@ -140,10 +133,10 @@ int unshare_fs_struct(void) return -ENOMEM; task_lock(current); - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); kill = !--fs->users; current->fs = new_fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); task_unlock(current); if (kill) @@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, - .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), - .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), + .seq = __SEQLOCK_UNLOCKED(init_fs.seq), .umask = 0022, }; diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index ca215a3cba3e..a774166264de 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -2,6 +2,7 @@ config FUSE_FS tristate "FUSE (Filesystem in Userspace) support" select FS_POSIX_ACL + select FS_IOMAP help With FUSE it is possible to implement a fully functional filesystem in a userspace program. diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 2a730d88cc3b..bb407705603c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/fs_context.h> +#include <linux/namei.h> #define FUSE_CTL_SUPER_MAGIC 0x65735543 @@ -212,7 +213,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct dentry *dentry; struct inode *inode; - BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); dentry = d_alloc_name(parent, name); if (!dentry) return NULL; @@ -236,8 +236,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_private = fc; d_add(dentry, inode); - fc->ctl_dentry[fc->ctl_ndents++] = dentry; - return dentry; } @@ -280,27 +278,29 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) return -ENOMEM; } +static void remove_one(struct dentry *dentry) +{ + d_inode(dentry)->i_private = NULL; +} + /* * Remove a connection from the control filesystem (if it exists). * Caller must hold fuse_mutex */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { - int i; + struct dentry *dentry; + char name[32]; if (!fuse_control_sb || fc->no_control) return; - for (i = fc->ctl_ndents - 1; i >= 0; i--) { - struct dentry *dentry = fc->ctl_dentry[i]; - d_inode(dentry)->i_private = NULL; - if (!i) { - /* Get rid of submounts: */ - d_invalidate(dentry); - } - dput(dentry); + sprintf(name, "%u", fc->dev); + dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root); + if (!IS_ERR(dentry)) { + simple_recursive_removal(dentry, remove_one); + dput(dentry); // paired with lookup_noperm_positive_unlocked() } - drop_nlink(d_inode(fuse_control_sb->s_root)); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) @@ -346,12 +346,8 @@ static int fuse_ctl_init_fs_context(struct fs_context *fsc) static void fuse_ctl_kill_sb(struct super_block *sb) { - struct fuse_conn *fc; - mutex_lock(&fuse_mutex); fuse_control_sb = NULL; - list_for_each_entry(fc, &fuse_conn_list, entry) - fc->ctl_ndents = 0; mutex_unlock(&fuse_mutex); kill_litter_super(sb); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 0502bf3cdf6a..ac6d4c1064cc 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -10,7 +10,6 @@ #include <linux/dax.h> #include <linux/uio.h> #include <linux/pagemap.h> -#include <linux/pfn_t.h> #include <linux/iomap.h> #include <linux/interval_tree.h> @@ -757,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - pfn_t pfn; + unsigned long pfn; int error = 0; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_conn_dax *fcd = fc->dax; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6dcbaa218b7a..e80cd8f2c049 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/splice.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -45,7 +46,7 @@ bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); } -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) { int i; @@ -816,7 +817,7 @@ static int unlock_request(struct fuse_req *req) return err; } -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); @@ -955,10 +956,10 @@ static int fuse_check_folio(struct folio *folio) * folio that was originally in @pagep will lose a reference and the new * folio returned in @pagep will carry a reference. */ -static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop) { int err; - struct folio *oldfolio = page_folio(*pagep); + struct folio *oldfolio = *foliop; struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; @@ -979,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) cs->pipebufs++; cs->nr_segs--; - if (cs->len != PAGE_SIZE) + if (cs->len != folio_size(oldfolio)) goto out_fallback; if (!pipe_buf_try_steal(cs->pipe, buf)) @@ -1025,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = &newfolio->page; + *foliop = newfolio; spin_unlock(&cs->req->waitq.lock); if (err) { @@ -1058,8 +1059,8 @@ out_fallback: goto out_put_old; } -static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, - unsigned offset, unsigned count) +static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, + unsigned offset, unsigned count) { struct pipe_buffer *buf; int err; @@ -1067,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; - get_page(page); + folio_get(folio); err = unlock_request(cs->req); if (err) { - put_page(page); + folio_put(folio); return err; } fuse_copy_finish(cs); buf = cs->pipebufs; - buf->page = page; + buf->page = &folio->page; buf->offset = offset; buf->len = count; @@ -1089,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, } /* - * Copy a page in the request to/from the userspace buffer. Must be + * Copy a folio in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, - unsigned offset, unsigned count, int zeroing) +static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop, + unsigned offset, unsigned count, int zeroing) { int err; - struct page *page = *pagep; + struct folio *folio = *foliop; + size_t size; - if (page && zeroing && count < PAGE_SIZE) - clear_highpage(page); + if (folio) { + size = folio_size(folio); + if (zeroing && count < size) + folio_zero_range(folio, 0, size); + } while (count) { - if (cs->write && cs->pipebufs && page) { + if (cs->write && cs->pipebufs && folio) { /* * Can't control lifetime of pipe buffers, so always * copy user pages. @@ -1112,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (err) return err; } else { - return fuse_ref_page(cs, page, offset, count); + return fuse_ref_folio(cs, folio, offset, count); } } else if (!cs->len) { - if (cs->move_pages && page && - offset == 0 && count == PAGE_SIZE) { - err = fuse_try_move_page(cs, pagep); + if (cs->move_folios && folio && + offset == 0 && count == size) { + err = fuse_try_move_folio(cs, foliop); if (err <= 0) return err; } else { @@ -1126,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, return err; } } - if (page) { - void *mapaddr = kmap_local_page(page); - void *buf = mapaddr + offset; - offset += fuse_copy_do(cs, &buf, &count); + if (folio) { + void *mapaddr = kmap_local_folio(folio, offset); + void *buf = mapaddr; + unsigned int copy = count; + unsigned int bytes_copied; + + if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset)) + copy = PAGE_SIZE - offset_in_page(offset); + + bytes_copied = fuse_copy_do(cs, &buf, ©); kunmap_local(mapaddr); + offset += bytes_copied; + count -= bytes_copied; } else offset += fuse_copy_do(cs, NULL, &count); } - if (page && !cs->write) - flush_dcache_page(page); + if (folio && !cs->write) + flush_dcache_folio(folio); return 0; } -/* Copy pages in the request to/from userspace buffer */ -static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, - int zeroing) +/* Copy folios in the request to/from userspace buffer */ +static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) { unsigned i; struct fuse_req *req = cs->req; @@ -1151,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); - struct page *orig, *pagep; - orig = pagep = &ap->folios[i]->page; - - err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing); if (err) return err; nbytes -= count; - - /* - * fuse_copy_page may have moved a page from a pipe instead of - * copying into our given page, so update the folios if it was - * replaced. - */ - if (pagep != orig) - ap->folios[i] = page_folio(pagep); } return 0; } @@ -1197,7 +1199,7 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; if (i == numargs - 1 && argpages) - err = fuse_copy_pages(cs, arg->size, zeroing); + err = fuse_copy_folios(cs, arg->size, zeroing); else err = fuse_copy_one(cs, arg->value, arg->size); } @@ -1538,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) if (!user_backed_iter(to)) return -EINVAL; - fuse_copy_init(&cs, 1, to); + fuse_copy_init(&cs, true, to); return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } @@ -1561,7 +1563,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, 1, NULL); + fuse_copy_init(&cs, true, NULL); cs.pipebufs = bufs; cs.pipe = pipe; ret = fuse_dev_do_read(fud, in, &cs, len); @@ -1786,20 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { struct folio *folio; - struct page *page; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_grab_folio(mapping, index); err = PTR_ERR(folio); if (IS_ERR(folio)) goto out_iput; - page = &folio->page; - this_num = min_t(unsigned, num, folio_size(folio) - offset); - err = fuse_copy_page(cs, &page, offset, this_num, 0); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + + err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); if (!folio_test_uptodate(folio) && !err && offset == 0 && - (this_num == folio_size(folio) || file_size == end)) { - folio_zero_segment(folio, this_num, folio_size(folio)); + (nr_bytes == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, nr_bytes, folio_size(folio)); folio_mark_uptodate(folio); } folio_unlock(folio); @@ -1808,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (err) goto out_iput; - num -= this_num; + num -= nr_bytes; offset = 0; - index++; + index += nr_pages; } err = 0; @@ -1849,7 +1854,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages, cur_pages = 0; + unsigned int num_pages; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1867,6 +1872,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); + num = min(num, num_pages << PAGE_SHIFT); args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); @@ -1887,25 +1893,29 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && cur_pages < num_pages) { + while (num) { struct folio *folio; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) break; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min(folio_size(folio) - folio_offset, num); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].offset = offset; - ap->descs[ap->num_folios].length = this_num; + ap->descs[ap->num_folios].offset = folio_offset; + ap->descs[ap->num_folios].length = nr_bytes; ap->num_folios++; - cur_pages++; offset = 0; - num -= this_num; - total_len += this_num; - index++; + num -= nr_bytes; + total_len += nr_bytes; + index += nr_pages; } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; @@ -2021,11 +2031,24 @@ static int fuse_notify_resend(struct fuse_conn *fc) return 0; } +/* + * Increments the fuse connection epoch. This will result of dentries from + * previous epochs to be invalidated. + * + * XXX optimization: add call to shrink_dcache_sb()? + */ +static int fuse_notify_inc_epoch(struct fuse_conn *fc) +{ + atomic_inc(&fc->epoch); + + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { - /* Don't try to move pages (yet) */ - cs->move_pages = 0; + /* Don't try to move folios (yet) */ + cs->move_folios = false; switch (code) { case FUSE_NOTIFY_POLL: @@ -2049,6 +2072,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); + case FUSE_NOTIFY_INC_EPOCH: + return fuse_notify_inc_epoch(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -2173,7 +2199,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) - cs->move_pages = 0; + cs->move_folios = false; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; @@ -2211,7 +2237,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) if (!user_backed_iter(from)) return -EINVAL; - fuse_copy_init(&cs, 0, from); + fuse_copy_init(&cs, false, from); return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } @@ -2285,13 +2311,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, 0, NULL); + fuse_copy_init(&cs, false, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) - cs.move_pages = 1; + cs.move_folios = true; ret = fuse_dev_do_write(fud, &cs, len); @@ -2602,6 +2628,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } +#ifdef CONFIG_PROC_FS +static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + if (!fud) + return; + + seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev); +} +#endif + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, @@ -2617,6 +2654,9 @@ const struct file_operations fuse_dev_operations = { #ifdef CONFIG_FUSE_IO_URING .uring_cmd = fuse_uring_cmd, #endif +#ifdef CONFIG_PROC_FS + .show_fdinfo = fuse_dev_show_fdinfo, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index accdce2977c5..249b210becb1 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -140,6 +140,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_ring_ent *ent; + struct fuse_req *req; + + ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); + if (!ent) + return false; + + req = ent->fuse_req; + + return time_is_before_jiffies(req->create_time + + fc->timeout.req_timeout); +} + bool fuse_uring_request_expired(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -157,7 +172,8 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) spin_lock(&queue->lock); if (fuse_request_expired(fc, &queue->fuse_req_queue) || fuse_request_expired(fc, &queue->fuse_req_bg_queue) || - fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + ent_list_request_expired(fc, &queue->ent_w_req_queue) || + ent_list_request_expired(fc, &queue->ent_in_userspace)) { spin_unlock(&queue->lock); return true; } @@ -494,7 +510,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); need_cmd_done = true; ent->cmd = NULL; } @@ -577,8 +593,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, if (err) return err; - fuse_copy_init(&cs, 0, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, false, &iter); + cs.is_uring = true; cs.req = req; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); @@ -607,8 +623,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, return err; } - fuse_copy_init(&cs, 1, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, true, &iter); + cs.is_uring = true; cs.req = req; if (num_args > 0) { @@ -714,7 +730,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, cmd = ent->cmd; ent->cmd = NULL; ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); io_uring_cmd_done(cmd, 0, 0, issue_flags); @@ -764,7 +780,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, clear_bit(FR_PENDING, &req->flags); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; - list_move(&ent->list, &queue->ent_w_req_queue); + list_move_tail(&ent->list, &queue->ent_w_req_queue); fuse_uring_add_to_pq(ent, req); } @@ -1180,7 +1196,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); ent->cmd = NULL; spin_unlock(&queue->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 7d7ed45cb3e9..2d817d7cab26 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, { struct inode *inode; struct fuse_mount *fm; + struct fuse_conn *fc; struct fuse_inode *fi; int ret; + fc = get_fuse_conn_super(dir->i_sb); + if (entry->d_time < atomic_read(&fc->epoch)) + goto invalid; + inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; @@ -333,13 +338,6 @@ const struct dentry_operations fuse_dentry_operations = { .d_automount = fuse_dentry_automount, }; -const struct dentry_operations fuse_root_dentry_operations = { -#if BITS_PER_LONG < 64 - .d_init = fuse_dentry_init, - .d_release = fuse_dentry_release, -#endif -}; - int fuse_valid_type(int m) { return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || @@ -412,16 +410,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { - int err; struct fuse_entry_out outarg; + struct fuse_conn *fc; struct inode *inode; struct dentry *newent; + int err, epoch; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + fc = get_fuse_conn_super(dir->i_sb); + epoch = atomic_read(&fc->epoch); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -443,6 +445,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, goto out_err; entry = newent ? newent : entry; + entry->d_time = epoch; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -616,7 +619,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { - int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); @@ -626,11 +628,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + int epoch, err; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -699,6 +703,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, } kfree(forget); d_instantiate(entry, inode); + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); @@ -785,12 +790,14 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; - int err; struct fuse_forget_link *forget; + int epoch, err; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + epoch = atomic_read(&fm->fc->epoch); + forget = fuse_alloc_forget(); if (!forget) return ERR_PTR(-ENOMEM); @@ -832,10 +839,13 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun if (IS_ERR(d)) return d; - if (d) + if (d) { + d->d_time = epoch; fuse_change_entry_timeout(d, &outarg); - else + } else { + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outarg); + } fuse_dir_changed(dir); return d; @@ -1609,10 +1619,10 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct folio *folio) +static int fuse_readlink_folio(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, @@ -1667,7 +1677,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!folio) goto out_err; - err = fuse_readlink_page(inode, folio); + err = fuse_readlink_folio(inode, folio); if (err) { folio_put(folio); goto out_err; @@ -1943,6 +1953,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -2027,6 +2038,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -2052,6 +2065,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. + */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); @@ -2257,7 +2278,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, folio); + int err = fuse_readlink_folio(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 754378dd9f71..5525a4520b0f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -21,6 +21,7 @@ #include <linux/filelock.h> #include <linux/splice.h> #include <linux/task_io_accounting_ops.h> +#include <linux/iomap.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -415,89 +416,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. * @@ -532,10 +455,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -870,12 +789,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, } } -static int fuse_do_readfolio(struct file *file, struct folio *folio) +static int fuse_do_readfolio(struct file *file, struct folio *folio, + size_t off, size_t len) { struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = folio_pos(folio); - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + loff_t pos = folio_pos(folio) + off; + struct fuse_folio_desc desc = { + .offset = off, + .length = len, + }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, @@ -886,13 +809,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -909,8 +825,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - folio_mark_uptodate(folio); - return 0; } @@ -923,13 +837,26 @@ static int fuse_read_folio(struct file *file, struct folio *folio) if (fuse_is_bad(inode)) goto out; - err = fuse_do_readfolio(file, folio); + err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); + if (!err) + folio_mark_uptodate(folio); + fuse_invalidate_atime(inode); out: folio_unlock(folio); return err; } +static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, + size_t len) +{ + struct file *file = iter->private; + size_t off = offset_in_folio(folio, pos); + + return fuse_do_readfolio(file, folio, off, len); +} + static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { @@ -965,14 +892,13 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = folio_pos(ap->folios[0]); - /* Currently, all folios in FUSE are one page */ - size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -1005,17 +931,13 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; + struct folio *folio = NULL; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1033,8 +955,8 @@ static void fuse_readahead(struct readahead_control *rac) while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; - struct folio *folio; unsigned cur_pages = min(max_pages, nr_pages); + unsigned int pages = 0; if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1046,10 +968,12 @@ static void fuse_readahead(struct readahead_control *rac) ia = fuse_io_alloc(NULL, cur_pages); if (!ia) - return; + break; ap = &ia->ap; - while (ap->num_folios < cur_pages) { + while (pages < cur_pages) { + unsigned int folio_pages; + /* * This returns a folio with a ref held on it. * The ref needs to be held until the request is @@ -1057,13 +981,31 @@ static void fuse_readahead(struct readahead_control *rac) * fuse_try_move_page()) drops the ref after it's * replaced in the page cache. */ - folio = __readahead_folio(rac); + if (!folio) + folio = __readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); + if (folio_pages > cur_pages - pages) { + /* + * Large folios belonging to fuse will never + * have more pages than max_pages. + */ + WARN_ON(!pages); + break; + } + ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; + pages += folio_pages; + folio = NULL; } - fuse_send_readpages(ia, rac->file); - nr_pages -= cur_pages; + fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); + nr_pages -= pages; + } + if (folio) { + folio_end_read(folio, false); + folio_put(folio); } } @@ -1181,7 +1123,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1221,32 +1163,28 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, - unsigned int max_pages) + unsigned int max_folios) { struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); - unsigned int nr_pages = 0; size_t count = 0; - int err; + unsigned int num; + int err = 0; + + num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; ap->descs[0].offset = offset; - do { + while (num && ap->num_folios < max_folios) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - size_t bytes = min_t(size_t, PAGE_SIZE - offset, - iov_iter_count(ii)); - - bytes = min_t(size_t, bytes, fc->max_write - count); + unsigned int bytes; + unsigned int folio_offset; again: - err = -EFAULT; - if (fault_in_iov_iter_readable(ii, bytes)) - break; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { @@ -1257,29 +1195,42 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); - tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + bytes = min(folio_size(folio) - folio_offset, num); + + tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); flush_dcache_folio(folio); if (!tmp) { folio_unlock(folio); folio_put(folio); + + /* + * Ensure forward progress by faulting in + * while not holding the folio lock: + */ + if (fault_in_iov_iter_readable(ii, bytes)) { + err = -EFAULT; + break; + } + goto again; } - err = 0; ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = folio_offset; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; - nr_pages++; count += tmp; pos += tmp; + num -= tmp; offset += tmp; - if (offset == PAGE_SIZE) + if (offset == folio_size(folio)) offset = 0; - /* If we copied full page, mark it uptodate */ - if (tmp == PAGE_SIZE) + /* If we copied full folio, mark it uptodate */ + if (tmp == folio_size(folio)) folio_mark_uptodate(folio); if (folio_test_uptodate(folio)) { @@ -1288,10 +1239,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, ia->write.folio_locked = true; break; } - if (!fc->big_writes) + if (!fc->big_writes || offset != 0) break; - } while (iov_iter_count(ii) && count < fc->max_write && - nr_pages < max_pages && offset == 0); + } return count > 0 ? count : err; } @@ -1440,6 +1390,24 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) } } +static const struct iomap_write_ops fuse_iomap_write_ops = { + .read_folio_range = fuse_iomap_read_folio_range, +}; + +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1449,6 +1417,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = mapping->host; ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); + bool writeback = false; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1457,16 +1426,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(idmap, - file_inode(file))) { - goto writethrough; - } - - return generic_file_write_iter(iocb, from); + if (!fc->handle_killpriv_v2 || + !setattr_should_drop_suidgid(idmap, file_inode(file))) + writeback = true; } -writethrough: inode_lock(inode); err = count = generic_write_checks(iocb, from); @@ -1485,6 +1449,15 @@ writethrough: goto out; written = direct_write_fallback(iocb, from, written, fuse_perform_write(iocb, from)); + } else if (writeback) { + /* + * Use iomap so that we can do granular uptodate reads + * and granular dirty tracking for large folios. + */ + written = iomap_file_buffered_write(iocb, from, + &fuse_iomap_ops, + &fuse_iomap_write_ops, + file); } else { written = fuse_perform_write(iocb, from); } @@ -1638,7 +1611,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1835,38 +1808,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. + */ + iomap_finish_folio_write(inode, ap->folios[i], 1); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1877,13 +1846,15 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_args *args = &wpa->ia.ap.args; - /* Currently, all folios in FUSE are one page */ - __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; - int err; + struct fuse_args *args = &ap->args; + __u64 data_size = 0; + int err, i; + + for (i = 0; i < ap->num_folios; i++) + data_size += ap->descs[i].length; fi->writectr++; if (inarg->offset + data_size <= size) { @@ -1914,19 +1885,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1954,43 +1914,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2010,41 +1933,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2078,17 +1966,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; - /* - * Inode is always written before the last reference is dropped and - * hence this should not be reached from reclaim. - * - * Writing back the inode from reclaim can deadlock if the request - * processing itself needs an allocation. Allocations triggering - * reclaim while serving a request can't be prevented, because it can - * involve any number of unrelated userspace processes. - */ - WARN_ON(wbc->for_reclaim); - ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) @@ -2131,22 +2008,20 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index, loff_t offset, unsigned len) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; - ap->descs[folio_index].offset = 0; - ap->descs[folio_index].length = PAGE_SIZE; + ap->folios[folio_index] = folio; + ap->descs[folio_index].offset = offset; + ap->descs[folio_index].length = len; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + size_t offset, struct fuse_file *ff) { struct inode *inode = folio->mapping->host; @@ -2159,7 +2034,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio return NULL; fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->inode = inode; wpa->ia.ff = ff; @@ -2171,74 +2046,28 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio return wpa; } -static int fuse_writepage_locked(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - struct inode *inode = mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_writepage_args *wpa; - struct fuse_args_pages *ap; - struct folio *tmp_folio; - struct fuse_file *ff; - int error = -ENOMEM; - - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; - ff = fuse_write_file_get(fi); - if (!ff) - goto err_nofile; - - wpa = fuse_writepage_args_setup(folio, ff); - error = -ENOMEM; - if (!wpa) - goto err_writepage_args; - - ap = &wpa->ia.ap; - ap->num_folios = 1; - - folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); - - spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); - list_add_tail(&wpa->queue_entry, &fi->queued_writes); - fuse_flush_writepages(inode); - spin_unlock(&fi->lock); - - folio_end_writeback(folio); - - return 0; - -err_writepage_args: - fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); -err: - mapping_set_error(folio->mapping, error); - return error; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; - struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; + /* + * nr_bytes won't overflow since fuse_writepage_need_send() caps + * wb requests to never exceed fc->max_pages (which has an upper bound + * of U16_MAX). + */ + unsigned int nr_bytes; }; -static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data, + unsigned int max_pages) { struct fuse_args_pages *ap = &data->wpa->ia.ap; - struct fuse_conn *fc = get_fuse_conn(data->inode); struct folio **folios; struct fuse_folio_desc *descs; unsigned int nfolios = min_t(unsigned int, max_t(unsigned int, data->max_folios * 2, FUSE_DEFAULT_MAX_PAGES_PER_REQ), - fc->max_pages); + max_pages); WARN_ON(nfolios <= data->max_folios); folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); @@ -2255,319 +2084,162 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) return true; } -static void fuse_writepages_send(struct fuse_fill_wb_data *data) +static void fuse_writepages_send(struct inode *inode, + struct fuse_fill_wb_data *data) { struct fuse_writepage_args *wpa = data->wpa; - struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); } -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. - */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; -} - -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, - struct fuse_args_pages *ap, +static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_folios); + struct folio *prev_folio; + struct fuse_folio_desc prev_desc; + unsigned bytes = data->nr_bytes + len; + loff_t prev_pos; - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; + WARN_ON(!ap->num_folios); /* Reached max pages */ - if (ap->num_folios == fc->max_pages) + if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) + if (bytes > fc->max_write) return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + prev_folio = ap->folios[ap->num_folios - 1]; + prev_desc = ap->descs[ap->num_folios - 1]; + prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; + if (prev_pos != pos) return true; /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && + !fuse_pages_realloc(data, fc->max_pages)) return true; return false; } -static int fuse_writepages_fill(struct folio *folio, - struct writeback_control *wbc, void *_data) +static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 pos, + unsigned len, u64 end_pos) { - struct fuse_fill_wb_data *data = _data; + struct fuse_fill_wb_data *data = wpc->wb_ctx; struct fuse_writepage_args *wpa = data->wpa; struct fuse_args_pages *ap = &wpa->ia.ap; - struct inode *inode = data->inode; + struct inode *inode = wpc->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; - int err; + loff_t offset = offset_in_folio(folio, pos); + + WARN_ON_ONCE(!data); if (!data->ff) { - err = -EIO; data->ff = fuse_write_file_get(fi); if (!data->ff) - goto out_unlock; + return -EIO; } - if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { - fuse_writepages_send(data); + if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { + fuse_writepages_send(inode, data); data->wpa = NULL; + data->nr_bytes = 0; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. - */ if (data->wpa == NULL) { - err = -ENOMEM; - wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); - goto out_unlock; - } + wpa = fuse_writepage_args_setup(folio, offset, data->ff); + if (!wpa) + return -ENOMEM; fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } - folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + iomap_start_folio_write(inode, folio, 1); + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, + offset, len); + data->nr_bytes += len; - err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); + + return len; +} + +static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc, + int error) +{ + struct fuse_fill_wb_data *data = wpc->wb_ctx; + + WARN_ON_ONCE(!data); + + if (data->wpa) { + WARN_ON(!data->wpa->ia.ap.num_folios); + fuse_writepages_send(wpc->inode, data); } -out_unlock: - folio_unlock(folio); - return err; + if (data->ff) + fuse_file_put(data->ff, false); + + return error; } +static const struct iomap_writeback_ops fuse_writeback_ops = { + .writeback_range = fuse_iomap_writeback_range, + .writeback_submit = fuse_iomap_writeback_submit, +}; + static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_wb_data data; - int err; + struct fuse_fill_wb_data data = {}; + struct iomap_writepage_ctx wpc = { + .inode = inode, + .iomap.type = IOMAP_MAPPED, + .wbc = wbc, + .ops = &fuse_writeback_ops, + .wb_ctx = &data, + }; - err = -EIO; if (fuse_is_bad(inode)) - goto out; + return -EIO; if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) return 0; - data.inode = inode; - data.wpa = NULL; - data.ff = NULL; - - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; - - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_folios); - fuse_writepages_send(&data); - } - if (data.ff) - fuse_file_put(data.ff, false); - - kfree(data.orig_folios); -out: - return err; -} - -/* - * It's worthy to make sure that space is reserved on disk for the write, - * but how to implement it without killing performance need more thinking. - */ -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct folio *folio; - loff_t fsize; - int err = -ENOMEM; - - WARN_ON(!fc->writeback_cache); - - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - goto error; - - fuse_wait_on_page_writeback(mapping->host, folio->index); - - if (folio_test_uptodate(folio) || len >= folio_size(folio)) - goto success; - /* - * Check if the start of this folio comes after the end of file, - * in which case the readpage can be optimized away. - */ - fsize = i_size_read(mapping->host); - if (fsize <= folio_pos(folio)) { - size_t off = offset_in_folio(folio, pos); - if (off) - folio_zero_segment(folio, 0, off); - goto success; - } - err = fuse_do_readfolio(file, folio); - if (err) - goto cleanup; -success: - *foliop = folio; - return 0; - -cleanup: - folio_unlock(folio); - folio_put(folio); -error: - return err; -} - -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) -{ - struct inode *inode = folio->mapping->host; - - /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ - if (!copied) - goto unlock; - - pos += copied; - if (!folio_test_uptodate(folio)) { - /* Zero any unwritten bytes at the end of the page */ - size_t endoff = pos & ~PAGE_MASK; - if (endoff) - folio_zero_segment(folio, endoff, PAGE_SIZE); - folio_mark_uptodate(folio); - } - - if (pos > inode->i_size) - i_size_write(inode, pos); - - folio_mark_dirty(folio); - -unlock: - folio_unlock(folio); - folio_put(folio); - - return copied; + return iomap_writepages(&wpc); } static int fuse_launder_folio(struct folio *folio) { int err = 0; - if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; + struct fuse_fill_wb_data data = {}; + struct iomap_writepage_ctx wpc = { + .inode = folio->mapping->host, + .iomap.type = IOMAP_MAPPED, + .ops = &fuse_writeback_ops, + .wb_ctx = &data, + }; - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(folio); + if (folio_clear_dirty_for_io(folio)) { + err = iomap_writeback_folio(&wpc, folio); + err = fuse_iomap_writeback_submit(&wpc, err); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2611,7 +2283,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3418,20 +3090,24 @@ static const struct address_space_operations fuse_file_aops = { .readahead = fuse_readahead, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, - .write_begin = fuse_write_begin, - .write_end = fuse_write_end, }; void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3439,7 +3115,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b3c2e32254ba..5a9bd771a319 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -20,7 +20,6 @@ struct fuse_iqueue; struct fuse_forget_link; struct fuse_copy_state { - int write; struct fuse_req *req; struct iov_iter *iter; struct pipe_buffer *pipebufs; @@ -30,8 +29,9 @@ struct fuse_copy_state { struct page *pg; unsigned int len; unsigned int offset; - unsigned int move_pages:1; - unsigned int is_uring:1; + bool write:1; + bool move_folios:1; + bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ } ring; @@ -51,7 +51,7 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); void fuse_dev_end_requests(struct list_head *head); -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, @@ -64,7 +64,6 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); #endif diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d56d4fd956db..ec248d13c8bf 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -74,8 +74,8 @@ extern struct list_head fuse_conn_list; extern struct mutex fuse_mutex; /** Module parameters */ -extern unsigned max_user_bgreq; -extern unsigned max_user_congthresh; +extern unsigned int max_user_bgreq; +extern unsigned int max_user_congthresh; /* One forget request */ struct fuse_forget_link { @@ -161,9 +161,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ @@ -636,6 +633,9 @@ struct fuse_conn { /** Number of fuse_dev's */ atomic_t dev_count; + /** Current epoch for up-to-date dentries */ + atomic_t epoch; + struct rcu_head rcu; /** The user id for this mount */ @@ -913,12 +913,6 @@ struct fuse_conn { /** Device ID from the root super block */ dev_t dev; - /** Dentries in the control filesystem */ - struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; - - /** number of dentries used in the above array */ - int ctl_ndents; - /** Key for lock owner ID scrambling */ u32 scramble_key[4]; @@ -1109,7 +1103,6 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) extern const struct file_operations fuse_dev_operations; extern const struct dentry_operations fuse_dentry_operations; -extern const struct dentry_operations fuse_root_dentry_operations; /** * Get a filled in inode @@ -1486,9 +1479,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc); long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); /* iomode.c */ int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd48e8d37f2e..ecb869e895ab 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -9,6 +9,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" +#include <linux/dax.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/file.h> @@ -41,7 +42,7 @@ unsigned int fuse_max_pages_limit = 256; unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; -unsigned max_user_bgreq; +unsigned int max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); @@ -49,7 +50,7 @@ MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); -unsigned max_user_congthresh; +unsigned int max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); @@ -162,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); + if (FUSE_IS_DAX(inode)) + dax_break_layout_final(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { @@ -962,6 +966,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); @@ -1036,7 +1041,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -1211,7 +1216,7 @@ static const struct super_operations fuse_super_operations = { .show_options = fuse_show_options, }; -static void sanitize_global_limit(unsigned *limit) +static void sanitize_global_limit(unsigned int *limit) { /* * The default maximum number of async requests is calculated to consume @@ -1232,7 +1237,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp) if (rv) return rv; - sanitize_global_limit((unsigned *)kp->arg); + sanitize_global_limit((unsigned int *)kp->arg); return 0; } @@ -1714,7 +1719,7 @@ static int fuse_fill_super_submount(struct super_block *sb, fi = get_fuse_inode(root); fi->nlookup--; - sb->s_d_op = &fuse_dentry_operations; + set_default_d_op(sb, &fuse_dentry_operations); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; @@ -1849,12 +1854,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); - sb->s_d_op = &fuse_root_dentry_operations; + set_default_d_op(sb, &fuse_dentry_operations); root_dentry = d_make_root(root); if (!root_dentry) goto err_dev_free; - /* Root dentry doesn't have .d_revalidate */ - sb->s_d_op = &fuse_dentry_operations; mutex_lock(&fuse_mutex); err = -EINVAL; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 2d9abf48828f..57032eadca6c 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode)); } -int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct fuse_file *ff; @@ -536,11 +536,13 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) cleanup: fuse_priv_ioctl_cleanup(inode, ff); + if (err == -ENOTTY) + err = -EOPNOTSUPP; return err; } int fuse_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct fuse_file *ff; @@ -572,5 +574,7 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, cleanup: fuse_priv_ioctl_cleanup(inode, ff); + if (err == -ENOTTY) + err = -EOPNOTSUPP; return err; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index edcd6f18a8a8..c2aae2eef086 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file, struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int epoch; if (!o->nodeid) { /* @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file, return -EIO; fc = get_fuse_conn(dir); + epoch = atomic_read(&fc->epoch); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); @@ -256,6 +258,7 @@ retry: } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + dentry->d_time = epoch; fuse_change_entry_timeout(dentry, o); dput(dentry); @@ -332,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_io_args ia = {}; - struct fuse_args_pages *ap = &ia.ap; - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_args *args = &ia.ap.args; + void *buf; + size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT); u64 attr_version = 0, evict_ctr = 0; bool locked; - folio = folio_alloc(GFP_KERNEL, 0); - if (!folio) + buf = kvmalloc(bufsize, GFP_KERNEL); + if (!buf) return -ENOMEM; + args->out_args[0].value = buf; + plus = fuse_use_readdirplus(inode, ctx); - ap->args.out_pages = true; - ap->num_folios = 1; - ap->folios = &folio; - ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS); } else { - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(fm, args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -369,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(folio_address(folio), res, - file, ctx, attr_version, + res = parse_dirplusfile(buf, res, file, ctx, attr_version, evict_ctr); } else { - res = parse_dirfile(folio_address(folio), res, file, - ctx); + res = parse_dirfile(buf, res, file, ctx); } } - folio_put(folio); + kvfree(buf); fuse_invalidate_atime(inode); return res; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 53c2626e90e7..c826e7ca49f5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -9,7 +9,6 @@ #include <linux/pci.h> #include <linux/interrupt.h> #include <linux/group_cpus.h> -#include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> #include <linux/virtio.h> @@ -862,7 +861,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs) { const struct cpumask *mask, *masks; - unsigned int q, cpu; + unsigned int q, cpu, nr_masks; /* First attempt to map using existing transport layer affinities * e.g. PCIe MSI-X @@ -882,7 +881,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f return; fallback: /* Attempt to map evenly in groups over the CPUs */ - masks = group_cpus_evenly(fs->num_request_queues); + masks = group_cpus_evenly(fs->num_request_queues, &nr_masks); /* If even this fails we default to all CPUs use first request queue */ if (!masks) { for_each_possible_cpu(cpu) @@ -891,7 +890,7 @@ fallback: } for (q = 0; q < fs->num_request_queues; q++) { - for_each_cpu(cpu, &masks[q]) + for_each_cpu(cpu, &masks[q % nr_masks]) fs->mq_map[cpu] = q + VQ_REQUEST; } kfree(masks); @@ -1008,7 +1007,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, - void **kaddr, pfn_t *pfn) + void **kaddr, unsigned long *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -1017,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0); + *pfn = fs->window_phys_addr + offset; return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 14f204cd5a82..47d74afd63ac 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -159,7 +159,11 @@ static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); - struct iomap_writepage_ctx wpc = { }; + struct iomap_writepage_ctx wpc = { + .inode = mapping->host, + .wbc = wbc, + .ops = &gfs2_writeback_ops, + }; int ret; /* @@ -168,7 +172,7 @@ static int gfs2_writepages(struct address_space *mapping, * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. */ - ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); + ret = iomap_writepages(&wpc); if (ret == 0 && wbc->nr_to_write > 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); return ret; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7703d0471139..131091520de6 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -963,12 +963,16 @@ static struct folio * gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) { struct inode *inode = iter->inode; + struct gfs2_inode *ip = GFS2_I(inode); unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocks; struct folio *folio; int status; + if (!gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip)) + return iomap_get_folio(iter, pos, len); + blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); if (status) @@ -987,7 +991,7 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - if (!gfs2_is_stuffed(ip)) + if (gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip)) gfs2_trans_add_databufs(ip->i_gl, folio, offset_in_folio(folio, pos), copied); @@ -995,13 +999,14 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, folio_unlock(folio); folio_put(folio); - if (tr->tr_num_buf_new) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - - gfs2_trans_end(sdp); + if (gfs2_is_jdata(ip) || gfs2_is_stuffed(ip)) { + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + gfs2_trans_end(sdp); + } } -static const struct iomap_folio_ops gfs2_iomap_folio_ops = { +const struct iomap_write_ops gfs2_iomap_write_ops = { .get_folio = gfs2_iomap_get_folio, .put_folio = gfs2_iomap_put_folio, }; @@ -1078,8 +1083,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, gfs2_trans_end(sdp); } - if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) - iomap->folio_ops = &gfs2_iomap_folio_ops; return 0; out_trans_end: @@ -1304,7 +1307,7 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length return 0; length = min(length, inode->i_size - from); return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, - NULL); + &gfs2_iomap_write_ops, NULL); } #define GFS2_JTRUNC_REVOKES 8192 @@ -2469,23 +2472,26 @@ out: return error; } -static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset, unsigned int len) +static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 offset, unsigned int len, u64 end_pos) { - int ret; - - if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) + if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode)))) return -EIO; - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) { + int ret; - memset(&wpc->iomap, 0, sizeof(wpc->iomap)); - ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap); - return ret; + memset(&wpc->iomap, 0, sizeof(wpc->iomap)); + ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap); + if (ret) + return ret; + } + + return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } const struct iomap_writeback_ops gfs2_writeback_ops = { - .map_blocks = gfs2_map_blocks, + .writeback_range = gfs2_writeback_range, + .writeback_submit = iomap_ioend_writeback_submit, }; diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 4e8b1e8ebdf3..6cdc72dd55a3 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } extern const struct iomap_ops gfs2_iomap_ops; +extern const struct iomap_write_ops gfs2_iomap_write_ops; extern const struct iomap_writeback_ops gfs2_writeback_ops; int gfs2_unstuff_dinode(struct gfs2_inode *ip); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index dbf1aede744c..509e2f0d97e7 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -60,6 +60,7 @@ #include <linux/crc32.h> #include <linux/vmalloc.h> #include <linux/bio.h> +#include <linux/log2.h> #include "gfs2.h" #include "incore.h" @@ -912,7 +913,6 @@ static int dir_make_exhash(struct inode *inode) struct qstr args; struct buffer_head *bh, *dibh; struct gfs2_leaf *leaf; - int y; u32 x; __be64 *lp; u64 bn; @@ -979,9 +979,7 @@ static int dir_make_exhash(struct inode *inode) i_size_write(inode, sdp->sd_sb.sb_bsize / 2); gfs2_add_inode_blocks(&dip->i_inode, 1); dip->i_diskflags |= GFS2_DIF_EXHASH; - - for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; - dip->i_depth = y; + dip->i_depth = ilog2(sdp->sd_hash_ptrs); gfs2_dinode_out(dip, dibh->b_data); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fd1147aa3891..72d95185a39f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -155,7 +155,7 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags) return fsflags; } -int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); @@ -276,7 +276,7 @@ out: } int gfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 fsflags = fa->flags, gfsflags = 0; @@ -1058,7 +1058,8 @@ retry: } pagefault_disable(); - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, + &gfs2_iomap_write_ops, NULL); pagefault_enable(); if (ret > 0) written += ret; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ba25b884169e..b6fd1cb17de7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -590,35 +590,31 @@ static void gfs2_demote_wake(struct gfs2_glock *gl) static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) { const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh; - unsigned state = ret & LM_OUT_ST_MASK; - trace_gfs2_glock_state_change(gl, state); - state_change(gl, state); - gh = find_first_waiter(gl); + if (!(ret & ~LM_OUT_ST_MASK)) { + unsigned state = ret & LM_OUT_ST_MASK; + + trace_gfs2_glock_state_change(gl, state); + state_change(gl, state); + } + /* Demote to UN request arrived during demote to SH or DF */ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && - state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state == LM_ST_UNLOCKED) gl->gl_target = LM_ST_UNLOCKED; /* Check for state != intended state */ - if (unlikely(state != gl->gl_target)) { - if (gh && (ret & LM_OUT_CANCELED)) - gfs2_holder_wake(gh); + if (unlikely(gl->gl_state != gl->gl_target)) { + struct gfs2_holder *gh = find_first_waiter(gl); + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { if (ret & LM_OUT_CANCELED) { list_del_init(&gh->gh_list); trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); gl->gl_target = gl->gl_state; - gh = find_first_waiter(gl); - if (gh) { - gl->gl_target = gh->gh_state; - if (do_promote(gl)) - goto out; - do_xmote(gl, gh, gl->gl_target); - return; - } goto out; } /* Some error or failed "try lock" - report it */ @@ -629,7 +625,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) goto out; } } - switch(state) { + switch(gl->gl_state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: do_xmote(gl, gh, gl->gl_target); @@ -640,8 +636,10 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ - fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n", - gl->gl_target, state); + fs_err(gl->gl_name.ln_sbd, + "glock %u:%llu requested=%u ret=%u\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gl->gl_req, ret); GLOCK_BUG_ON(gl, 1); } return; @@ -650,7 +648,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) /* Fast path - we got what we asked for */ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) gfs2_demote_wake(gl); - if (state != LM_ST_UNLOCKED) { + if (gl->gl_state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { int rv; @@ -802,7 +800,8 @@ skip_inval: * We skip telling dlm to do the locking, so we won't get a * reply that would otherwise clear GLF_LOCK. So we clear it here. */ - clear_bit(GLF_LOCK, &gl->gl_flags); + if (!test_bit(GLF_CANCELING, &gl->gl_flags)) + clear_bit(GLF_LOCK, &gl->gl_flags); clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c171f745650f..9339a3bff6ee 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -92,12 +92,22 @@ enum { * LM_OUT_ST_MASK * Masks the lower two bits of lock state in the returned value. * + * LM_OUT_TRY_AGAIN + * The trylock request failed. + * + * LM_OUT_DEADLOCK + * The lock request failed because it would deadlock. + * * LM_OUT_CANCELED * The lock request was canceled. * + * LM_OUT_ERROR + * The lock request timed out or failed. */ #define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_TRY_AGAIN 0x00000020 +#define LM_OUT_DEADLOCK 0x00000010 #define LM_OUT_CANCELED 0x00000008 #define LM_OUT_ERROR 0x00000004 diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index cebd66b22694..fe0faad4892f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -11,6 +11,7 @@ #include <linux/bio.h> #include <linux/posix_acl.h> #include <linux/security.h> +#include <linux/log2.h> #include "gfs2.h" #include "incore.h" @@ -450,6 +451,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) gfs2_consist_inode(ip); return -EIO; } + if ((ip->i_diskflags & GFS2_DIF_EXHASH) && + depth < ilog2(sdp->sd_hash_ptrs)) { + gfs2_consist_inode(ip); + return -EIO; + } ip->i_depth = (u8)depth; ip->i_entries = be32_to_cpu(str->di_entries); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0a41c4e76b32..d4ad82f47eee 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -375,7 +375,6 @@ struct gfs2_glock { enum { GIF_QD_LOCKED = 1, - GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 187d789a8f1e..8760e7e20c9d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -444,11 +444,9 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip) struct inode *inode = &ip->i_inode; struct gfs2_glock *gl = ip->i_gl; - if (unlikely(!gl)) { - /* This can only happen during incomplete inode creation. */ - BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + /* This can only happen during incomplete inode creation. */ + if (unlikely(!gl)) return; - } truncate_inode_pages(gfs2_glock2aspace(gl), 0); truncate_inode_pages(&inode->i_data, 0); @@ -902,7 +900,6 @@ fail_gunlock3: fail_gunlock2: gfs2_glock_put(io_gl); fail_dealloc_inode: - set_bit(GIF_ALLOC_FAILED, &ip->i_flags); dealloc_error = 0; if (ip->i_eattr) dealloc_error = gfs2_ea_dealloc(ip, xattr_initialized); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index eafe123617e6..e43f08eb26e7 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -44,17 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip) static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) { - inode->i_blocks = blocks << (inode->i_blkbits - 9); + inode->i_blocks = blocks << (inode->i_blkbits - SECTOR_SHIFT); } static inline u64 gfs2_get_inode_blocks(const struct inode *inode) { - return inode->i_blocks >> (inode->i_blkbits - 9); + return inode->i_blocks >> (inode->i_blkbits - SECTOR_SHIFT); } static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) { - change <<= inode->i_blkbits - 9; + change <<= inode->i_blkbits - SECTOR_SHIFT; gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change)); inode->i_blocks += change; } @@ -107,9 +107,9 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; -int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int gfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); void gfs2_set_inode_flags(struct inode *inode); #ifdef CONFIG_GFS2_FS_LOCKING_DLM diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 7cb9d216d8bb..cee5d199d2d8 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -119,7 +119,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl) static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; - unsigned ret = gl->gl_state; + unsigned ret; /* If the glock is dead, we only react to a dlm_unlock() reply. */ if (__lockref_is_dead(&gl->gl_lockref) && @@ -139,13 +139,16 @@ static void gdlm_ast(void *arg) gfs2_glock_free(gl); return; case -DLM_ECANCEL: /* Cancel while getting lock */ - ret |= LM_OUT_CANCELED; + ret = LM_OUT_CANCELED; goto out; case -EAGAIN: /* Try lock fails */ + ret = LM_OUT_TRY_AGAIN; + goto out; case -EDEADLK: /* Deadlock detected */ + ret = LM_OUT_DEADLOCK; goto out; case -ETIMEDOUT: /* Canceled due to timeout */ - ret |= LM_OUT_ERROR; + ret = LM_OUT_ERROR; goto out; case 0: /* Success */ break; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 9dc8885c95d0..7fb11ff71b5a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -103,6 +103,7 @@ const struct address_space_operations gfs2_meta_aops = { .invalidate_folio = block_invalidate_folio, .writepages = gfs2_aspace_writepages, .release_folio = gfs2_release_folio, + .migrate_folio = buffer_migrate_folio_norefs, }; const struct address_space_operations gfs2_rgrp_aops = { @@ -110,6 +111,7 @@ const struct address_space_operations gfs2_rgrp_aops = { .invalidate_folio = block_invalidate_folio, .writepages = gfs2_aspace_writepages, .release_folio = gfs2_release_folio, + .migrate_folio = buffer_migrate_folio_norefs, }; /** @@ -228,7 +230,7 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) struct bio *bio; bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> SECTOR_SHIFT); while (num > 0) { bh = *bhs; if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) { @@ -443,11 +445,9 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) struct buffer_head *bh; int ty; - if (!ip->i_gl) { - /* This can only happen during incomplete inode creation. */ - BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + /* This can only happen during incomplete inode creation. */ + if (!ip->i_gl) return; - } gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 653f0ff4b057..efe99b732551 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -64,7 +64,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt) void free_sbd(struct gfs2_sbd *sdp) { + struct super_block *sb = sdp->sd_vfs; + free_percpu(sdp->sd_lkstats); + sb->s_fs_info = NULL; kfree(sdp); } @@ -160,7 +163,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } - if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE || + if (sb->sb_bsize < SECTOR_SIZE || sb->sb_bsize > PAGE_SIZE || (sb->sb_bsize & (sb->sb_bsize - 1))) { pr_warn("Invalid block size\n"); return -EINVAL; @@ -221,8 +224,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) if (unlikely(!sb)) return -ENOMEM; err = bdev_rw_virt(sdp->sd_vfs->s_bdev, - sector * (sdp->sd_vfs->s_blocksize >> 9), sb, PAGE_SIZE, - REQ_OP_READ | REQ_META); + sector << (sdp->sd_vfs->s_blocksize_bits - SECTOR_SHIFT), + sb, PAGE_SIZE, REQ_OP_READ | REQ_META); if (err) { pr_warn("error %d reading superblock\n", err); kfree(sb); @@ -254,7 +257,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) return error; } - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - SECTOR_SHIFT; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); @@ -1142,7 +1145,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; - sb->s_d_op = &gfs2_dops; + set_default_d_op(sb, &gfs2_dops); sb->s_export_op = &gfs2_export_ops; sb->s_qcop = &gfs2_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; @@ -1152,12 +1155,12 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. */ - sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512); + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, SECTOR_SIZE); error = -EINVAL; if (!sdp->sd_sb.sb_bsize) goto fail_free; sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - SECTOR_SHIFT; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; @@ -1314,7 +1317,6 @@ fail_iput: iput(sdp->sd_inode); fail_free: free_sbd(sdp); - sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7c518c4ff638..b42e2110084b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -487,11 +487,9 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; - if (unlikely(!ip->i_gl)) { - /* This can only happen during incomplete inode creation. */ - BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + /* This can only happen during incomplete inode creation. */ + if (unlikely(!ip->i_gl)) return; - } if (gfs2_withdrawing_or_withdrawn(sdp)) return; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 748125653d6c..c3c8842920d2 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -764,7 +764,6 @@ fail_reg: fs_err(sdp, "error %d adding sysfs files\n", error); kobject_put(&sdp->sd_kobj); wait_for_completion(&sdp->sd_kobj_unregister); - sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index d5a1e63fa257..24864a66074b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -232,32 +232,23 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) */ ret = gfs2_glock_nq(&sdp->sd_live_gh); + gfs2_glock_put(live_gl); /* drop extra reference we acquired */ + clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + /* * If we actually got the "live" lock in EX mode, there are no other - * nodes available to replay our journal. So we try to replay it - * ourselves. We hold the "live" glock to prevent other mounters - * during recovery, then just dequeue it and reacquire it in our - * normal SH mode. Just in case the problem that caused us to - * withdraw prevents us from recovering our journal (e.g. io errors - * and such) we still check if the journal is clean before proceeding - * but we may wait forever until another mounter does the recovery. + * nodes available to replay our journal. */ if (ret == 0) { - fs_warn(sdp, "No other mounters found. Trying to recover our " - "own journal jid %d.\n", sdp->sd_lockstruct.ls_jid); - if (gfs2_recover_journal(sdp->sd_jdesc, 1)) - fs_warn(sdp, "Unable to recover our journal jid %d.\n", - sdp->sd_lockstruct.ls_jid); - gfs2_glock_dq_wait(&sdp->sd_live_gh); - gfs2_holder_reinit(LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT | GL_NOPID, - &sdp->sd_live_gh); - gfs2_glock_nq(&sdp->sd_live_gh); + fs_warn(sdp, "No other mounters found.\n"); + /* + * We are about to release the lockspace. By keeping live_gl + * locked here, we ensure that the next mounter coming along + * will be a "first" mounter which will perform recovery. + */ + goto skip_recovery; } - gfs2_glock_put(live_gl); /* drop extra reference we acquired */ - clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); - /* * At this point our journal is evicted, so we need to get a new inode * for it. Once done, we need to call gfs2_find_jhead which diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index ef9498a6e88a..34e9804e0f36 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -16,6 +16,9 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; + if (!tree || !fd) + return -EINVAL; + fd->tree = tree; fd->bnode = NULL; ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index cb823a8a6ba9..e8cd1a31f247 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -15,6 +15,48 @@ #include "btree.h" +static inline +bool is_bnode_offset_valid(struct hfs_bnode *node, int off) +{ + bool is_valid = off < node->tree->node_size; + + if (!is_valid) { + pr_err("requested invalid offset: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d\n", + node->this, node->type, node->height, + node->tree->node_size, off); + } + + return is_valid; +} + +static inline +int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) +{ + unsigned int node_size; + + if (!is_bnode_offset_valid(node, off)) + return 0; + + node_size = node->tree->node_size; + + if ((off + len) > node_size) { + int new_len = (int)node_size - off; + + pr_err("requested length has been corrected: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, " + "requested_len %d, corrected_len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len, new_len); + + return new_len; + } + + return len; +} + void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) { struct page *page; @@ -22,6 +64,20 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) int bytes_read; int bytes_to_read; + if (!is_bnode_offset_valid(node, off)) + return; + + if (len == 0) { + pr_err("requested zero length: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len); + return; + } + + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagenum = off >> PAGE_SHIFT; off &= ~PAGE_MASK; /* compute page offset for the first page */ @@ -80,6 +136,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) { struct page *page; + if (!is_bnode_offset_valid(node, off)) + return; + + if (len == 0) { + pr_err("requested zero length: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len); + return; + } + + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; page = node->page[0]; @@ -104,6 +174,20 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) { struct page *page; + if (!is_bnode_offset_valid(node, off)) + return; + + if (len == 0) { + pr_err("requested zero length: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len); + return; + } + + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; page = node->page[0]; @@ -119,6 +203,10 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; + + len = check_and_correct_requested_length(src_node, src, len); + len = check_and_correct_requested_length(dst_node, dst, len); + src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -136,6 +224,10 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; + + len = check_and_correct_requested_length(node, src, len); + len = check_and_correct_requested_length(node, dst, len); + src += node->page_offset; dst += node->page_offset; page = node->page[0]; @@ -482,6 +574,7 @@ void hfs_bnode_put(struct hfs_bnode *node) if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); + hfs_bnode_clear(node, 0, tree->node_size); hfs_bmap_free(node); hfs_bnode_free(node); return; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 2fa4b1f8cc7f..e86e1e235658 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -21,8 +21,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke struct hfs_btree *tree; struct hfs_btree_header_rec *head; struct address_space *mapping; - struct page *page; + struct folio *folio; + struct buffer_head *bh; unsigned int size; + u16 dblock; + sector_t start_block; + loff_t offset; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) @@ -75,12 +79,40 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke unlock_new_inode(tree->inode); mapping = tree->inode->i_mapping; - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) + folio = filemap_grab_folio(mapping, 0); + if (IS_ERR(folio)) goto free_inode; + folio_zero_range(folio, 0, folio_size(folio)); + + dblock = hfs_ext_find_block(HFS_I(tree->inode)->first_extents, 0); + start_block = HFS_SB(sb)->fs_start + (dblock * HFS_SB(sb)->fs_div); + + size = folio_size(folio); + offset = 0; + while (size > 0) { + size_t len; + + bh = sb_bread(sb, start_block); + if (!bh) { + pr_err("unable to read tree header\n"); + goto put_folio; + } + + len = min_t(size_t, folio_size(folio), sb->s_blocksize); + memcpy_to_folio(folio, offset, bh->b_data, sb->s_blocksize); + + brelse(bh); + + start_block++; + offset += len; + size -= len; + } + + folio_mark_uptodate(folio); + /* Load the header */ - head = (struct hfs_btree_header_rec *)(kmap_local_page(page) + + head = (struct hfs_btree_header_rec *)(kmap_local_folio(folio, 0) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); @@ -95,22 +127,22 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke size = tree->node_size; if (!is_power_of_2(size)) - goto fail_page; + goto fail_folio; if (!tree->node_count) - goto fail_page; + goto fail_folio; switch (id) { case HFS_EXT_CNID: if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { pr_err("invalid extent max_key_len %d\n", tree->max_key_len); - goto fail_page; + goto fail_folio; } break; case HFS_CAT_CNID: if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); - goto fail_page; + goto fail_folio; } break; default: @@ -121,12 +153,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; kunmap_local(head); - put_page(page); + folio_unlock(folio); + folio_put(folio); return tree; -fail_page: +fail_folio: kunmap_local(head); - put_page(page); +put_folio: + folio_unlock(folio); + folio_put(folio); free_inode: tree->inode->i_mapping->a_ops = &hfs_aops; iput(tree->inode); diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 4a0ce131e233..580c62981dbd 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -71,7 +71,7 @@ int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2) * * Find a block within an extent record */ -static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) +u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off) { int i; u16 count; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index a0c7cb0f79fc..7c5a7ecfa246 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -190,6 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *, const btree_key *); +extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off); extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); extern int hfs_ext_write_extent(struct inode *); extern int hfs_extend_file(struct inode *); @@ -201,7 +202,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; -int hfs_write_begin(struct file *file, struct address_space *mapping, +int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a81ce7a740b9..bf4cb7e78396 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -44,12 +44,12 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } } -int hfs_write_begin(struct file *file, struct address_space *mapping, +int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) @@ -690,8 +690,9 @@ static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, .fsync = hfs_file_fsync, .open = hfs_file_open, .release = hfs_file_release, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index fe09c2093a93..388a318297ec 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -365,7 +365,7 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) if (!root_inode) goto bail_no_root; - sb->s_d_op = &hfs_dentry_operations; + set_default_d_op(sb, &hfs_dentry_operations); res = -ENOMEM; sb->s_root = d_make_root(root_inode); if (!sb->s_root) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 079ea80534f7..14f4995588ff 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -18,12 +18,68 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" +static inline +bool is_bnode_offset_valid(struct hfs_bnode *node, int off) +{ + bool is_valid = off < node->tree->node_size; + + if (!is_valid) { + pr_err("requested invalid offset: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d\n", + node->this, node->type, node->height, + node->tree->node_size, off); + } + + return is_valid; +} + +static inline +int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len) +{ + unsigned int node_size; + + if (!is_bnode_offset_valid(node, off)) + return 0; + + node_size = node->tree->node_size; + + if ((off + len) > node_size) { + int new_len = (int)node_size - off; + + pr_err("requested length has been corrected: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, " + "requested_len %d, corrected_len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len, new_len); + + return new_len; + } + + return len; +} + /* Copy a specified range of bytes from the raw data of a node */ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) { struct page **pagep; int l; + if (!is_bnode_offset_valid(node, off)) + return; + + if (len == 0) { + pr_err("requested zero length: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len); + return; + } + + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; @@ -81,6 +137,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) struct page **pagep; int l; + if (!is_bnode_offset_valid(node, off)) + return; + + if (len == 0) { + pr_err("requested zero length: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len); + return; + } + + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; @@ -109,6 +179,20 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) struct page **pagep; int l; + if (!is_bnode_offset_valid(node, off)) + return; + + if (len == 0) { + pr_err("requested zero length: " + "NODE: id %u, type %#x, height %u, " + "node_size %u, offset %d, len %d\n", + node->this, node->type, node->height, + node->tree->node_size, off, len); + return; + } + + len = check_and_correct_requested_length(node, off, len); + off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); off &= ~PAGE_MASK; @@ -133,6 +217,10 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; + + len = check_and_correct_requested_length(src_node, src, len); + len = check_and_correct_requested_length(dst_node, dst, len); + src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -187,6 +275,10 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; + + len = check_and_correct_requested_length(node, src, len); + len = check_and_correct_requested_length(node, dst, len); + src += node->page_offset; dst += node->page_offset; if (dst > src) { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a6d61685ae79..b1699b3c246a 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -342,9 +342,6 @@ static int hfsplus_free_extents(struct super_block *sb, int i; int err = 0; - /* Mapping the allocation file may lock the extent tree */ - WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock)); - hfsplus_dump_extent(extent); for (i = 0; i < 8; extent++, i++) { count = be32_to_cpu(extent->block_count); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 2f089bff0095..96a5c24813dd 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -473,8 +473,10 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata); +int hfsplus_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); @@ -489,9 +491,9 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int hfsplus_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f331e9574217..b51a411ecd23 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -38,12 +38,14 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } } -int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +int hfsplus_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) @@ -366,8 +368,9 @@ static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, .fsync = hfsplus_file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, @@ -654,7 +657,7 @@ out: return res; } -int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -673,7 +676,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int hfsplus_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 948b8aaee33e..86351bdc8985 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -222,8 +222,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr_buf, NULL, REQ_OP_WRITE | - REQ_SYNC); + sbi->s_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) error = error2; if (!write_backup) @@ -231,8 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE | - REQ_SYNC); + sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE); if (!error) error2 = error; out: @@ -508,7 +506,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) goto out_put_alloc_file; } - sb->s_d_op = &hfsplus_dentry_operations; + set_default_d_op(sb, &hfsplus_dentry_operations); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 73342c925a4b..36b6cf2a3abb 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -132,7 +132,14 @@ int hfsplus_uni2asc(struct super_block *sb, op = astr; ip = ustr->unicode; + ustrlen = be16_to_cpu(ustr->length); + if (ustrlen > HFSPLUS_MAX_STRLEN) { + ustrlen = HFSPLUS_MAX_STRLEN; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(ustr->length), ustrlen); + } + len = *len_p; ce1 = NULL; compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 9a1a93e3888b..18dc3d254d21 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -172,7 +172,11 @@ check_attr_tree_state_again: return PTR_ERR(attr_file); } - BUG_ON(i_size_read(attr_file) != 0); + if (i_size_read(attr_file) != 0) { + err = -EIO; + pr_err("detected inconsistent attributes file, running fsck.hfsplus is recommended.\n"); + goto end_attr_file_creation; + } hip = HFSPLUS_I(attr_file); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 702c41317589..01e516175bcd 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -382,7 +382,7 @@ static const struct file_operations hostfs_file_fops = { .splice_write = iter_file_splice_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .open = hostfs_open, .release = hostfs_file_release, .fsync = hostfs_fsync, @@ -445,7 +445,8 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) return ret; } -static int hostfs_write_begin(struct file *file, struct address_space *mapping, +static int hostfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -458,7 +459,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -static int hostfs_write_end(struct file *file, struct address_space *mapping, +static int hostfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { @@ -468,7 +470,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, int err; buffer = kmap_local_folio(folio, from); - err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied); + err = write_file(FILE_HOSTFS_I(iocb->ki_filp)->fd, &pos, buffer, copied); kunmap_local(buffer); if (!folio_test_uptodate(folio) && err == folio_size(folio)) @@ -933,7 +935,7 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; - sb->s_d_op = &simple_dentry_operations; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_maxbytes = MAX_LFS_FILESIZE; err = super_setup_bdi(sb); if (err) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 449a3fc1b8d9..263b5bbe1849 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -188,13 +188,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) hpfs_unlock(inode->i_sb); } -static int hpfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int hpfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); if (unlikely(ret)) @@ -203,13 +204,14 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int hpfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hpfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) hpfs_write_failed(mapping, pos + len); if (!(err < 0)) { @@ -255,7 +257,7 @@ const struct file_operations hpfs_file_ops = .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .release = hpfs_file_release, .fsync = hpfs_file_fsync, .splice_read = filemap_splice_read, diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 27567920abe4..42b779b4d87f 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -554,7 +554,7 @@ static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; - s->s_d_op = &hpfs_dentry_operations; + set_default_d_op(s, &hpfs_dentry_operations); s->s_time_min = local_to_gmt(s, 0); s->s_time_max = local_to_gmt(s, U32_MAX); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e4de5425838d..09d4baef29cf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -150,10 +150,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (inode->i_flags & S_PRIVATE) vm_flags |= VM_NORESERVE; - if (!hugetlb_reserve_pages(inode, + if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, - vm_flags)) + vm_flags) < 0) goto out; ret = 0; @@ -179,12 +179,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; - if (flags & MAP_FIXED) { - if (addr & ~huge_page_mask(h)) - return -EINVAL; - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - } + if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h))) + return -EINVAL; if (addr) addr0 = ALIGN(addr, huge_page_size(h)); @@ -311,7 +307,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval; } -static int hugetlbfs_write_begin(struct file *file, +static int hugetlbfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -319,9 +315,10 @@ static int hugetlbfs_write_begin(struct file *file, return -EINVAL; } -static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hugetlbfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; @@ -1433,6 +1430,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; /* @@ -1561,9 +1559,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - if (!hugetlb_reserve_pages(inode, 0, + if (hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, - acctflag)) + acctflag) < 0) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, @@ -1587,7 +1585,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; - mnt = fc_mount(fc); + mnt = fc_mount_longterm(fc); put_fs_context(fc); } if (IS_ERR(mnt)) diff --git a/fs/inode.c b/fs/inode.c index 99318b157a9a..01ebdc40021e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -865,12 +865,12 @@ static void dispose_list(struct list_head *head) */ void evict_inodes(struct super_block *sb) { - struct inode *inode, *next; + struct inode *inode; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -1158,9 +1158,8 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* - * ensure nobody is actually holding i_mutex + * ensure nobody is actually holding i_rwsem */ - // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); @@ -2615,7 +2614,7 @@ EXPORT_SYMBOL(inode_dio_finished); * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references - * to i_dio_count, usually by inode->i_mutex. + * to i_dio_count, usually by inode->i_rwsem. */ void inode_dio_wait(struct inode *inode) { @@ -2633,7 +2632,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * - * Note: the caller should be holding i_mutex, or else be sure that + * Note: the caller should be holding i_rwsem exclusively, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify @@ -2641,7 +2640,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible); * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * - * In the long run, i_mutex is overkill, and we should probably look + * In the long run, i_rwsem is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! diff --git a/fs/internal.h b/fs/internal.h index 393f6c5c24f6..38e8aab27bbd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -101,6 +101,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); +void backing_file_set_user_path(struct file *f, const struct path *path); static inline void file_put_write_access(struct file *file) { @@ -322,12 +323,15 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { + struct dentry *(*stash_dentry)(struct dentry **stashed, + struct dentry *dentry); void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry); struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted @@ -350,3 +354,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags); int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); +void pidfs_get_root(struct path *path); diff --git a/fs/ioctl.c b/fs/ioctl.c index 69107a245b4c..0248cb8db2d3 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -453,315 +453,6 @@ out: return ret; } -/** - * fileattr_fill_xflags - initialize fileattr with xflags - * @fa: fileattr pointer - * @xflags: FS_XFLAG_* flags - * - * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All - * other fields are zeroed. - */ -void fileattr_fill_xflags(struct fileattr *fa, u32 xflags) -{ - memset(fa, 0, sizeof(*fa)); - fa->fsx_valid = true; - fa->fsx_xflags = xflags; - if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE) - fa->flags |= FS_IMMUTABLE_FL; - if (fa->fsx_xflags & FS_XFLAG_APPEND) - fa->flags |= FS_APPEND_FL; - if (fa->fsx_xflags & FS_XFLAG_SYNC) - fa->flags |= FS_SYNC_FL; - if (fa->fsx_xflags & FS_XFLAG_NOATIME) - fa->flags |= FS_NOATIME_FL; - if (fa->fsx_xflags & FS_XFLAG_NODUMP) - fa->flags |= FS_NODUMP_FL; - if (fa->fsx_xflags & FS_XFLAG_DAX) - fa->flags |= FS_DAX_FL; - if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) - fa->flags |= FS_PROJINHERIT_FL; -} -EXPORT_SYMBOL(fileattr_fill_xflags); - -/** - * fileattr_fill_flags - initialize fileattr with flags - * @fa: fileattr pointer - * @flags: FS_*_FL flags - * - * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). - * All other fields are zeroed. - */ -void fileattr_fill_flags(struct fileattr *fa, u32 flags) -{ - memset(fa, 0, sizeof(*fa)); - fa->flags_valid = true; - fa->flags = flags; - if (fa->flags & FS_SYNC_FL) - fa->fsx_xflags |= FS_XFLAG_SYNC; - if (fa->flags & FS_IMMUTABLE_FL) - fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; - if (fa->flags & FS_APPEND_FL) - fa->fsx_xflags |= FS_XFLAG_APPEND; - if (fa->flags & FS_NODUMP_FL) - fa->fsx_xflags |= FS_XFLAG_NODUMP; - if (fa->flags & FS_NOATIME_FL) - fa->fsx_xflags |= FS_XFLAG_NOATIME; - if (fa->flags & FS_DAX_FL) - fa->fsx_xflags |= FS_XFLAG_DAX; - if (fa->flags & FS_PROJINHERIT_FL) - fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; -} -EXPORT_SYMBOL(fileattr_fill_flags); - -/** - * vfs_fileattr_get - retrieve miscellaneous file attributes - * @dentry: the object to retrieve from - * @fa: fileattr pointer - * - * Call i_op->fileattr_get() callback, if exists. - * - * Return: 0 on success, or a negative error on failure. - */ -int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - - if (!inode->i_op->fileattr_get) - return -ENOIOCTLCMD; - - return inode->i_op->fileattr_get(dentry, fa); -} -EXPORT_SYMBOL(vfs_fileattr_get); - -/** - * copy_fsxattr_to_user - copy fsxattr to userspace. - * @fa: fileattr pointer - * @ufa: fsxattr user pointer - * - * Return: 0 on success, or -EFAULT on failure. - */ -int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa) -{ - struct fsxattr xfa; - - memset(&xfa, 0, sizeof(xfa)); - xfa.fsx_xflags = fa->fsx_xflags; - xfa.fsx_extsize = fa->fsx_extsize; - xfa.fsx_nextents = fa->fsx_nextents; - xfa.fsx_projid = fa->fsx_projid; - xfa.fsx_cowextsize = fa->fsx_cowextsize; - - if (copy_to_user(ufa, &xfa, sizeof(xfa))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL(copy_fsxattr_to_user); - -static int copy_fsxattr_from_user(struct fileattr *fa, - struct fsxattr __user *ufa) -{ - struct fsxattr xfa; - - if (copy_from_user(&xfa, ufa, sizeof(xfa))) - return -EFAULT; - - fileattr_fill_xflags(fa, xfa.fsx_xflags); - fa->fsx_extsize = xfa.fsx_extsize; - fa->fsx_nextents = xfa.fsx_nextents; - fa->fsx_projid = xfa.fsx_projid; - fa->fsx_cowextsize = xfa.fsx_cowextsize; - - return 0; -} - -/* - * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject - * any invalid configurations. - * - * Note: must be called with inode lock held. - */ -static int fileattr_set_prepare(struct inode *inode, - const struct fileattr *old_ma, - struct fileattr *fa) -{ - int err; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - */ - if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - - err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags); - if (err) - return err; - - /* - * Project Quota ID state is only allowed to change from within the init - * namespace. Enforce that restriction only if we are trying to change - * the quota ID state. Everything else is allowed in user namespaces. - */ - if (current_user_ns() != &init_user_ns) { - if (old_ma->fsx_projid != fa->fsx_projid) - return -EINVAL; - if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & - FS_XFLAG_PROJINHERIT) - return -EINVAL; - } else { - /* - * Caller is allowed to change the project ID. If it is being - * changed, make sure that the new value is valid. - */ - if (old_ma->fsx_projid != fa->fsx_projid && - !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) - return -EINVAL; - } - - /* Check extent size hints. */ - if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) - return -EINVAL; - - if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && - !S_ISDIR(inode->i_mode)) - return -EINVAL; - - if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && - !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return -EINVAL; - - /* - * It is only valid to set the DAX flag on regular files and - * directories on filesystems. - */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && - !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - return -EINVAL; - - /* Extent size hints of zero turn off the flags. */ - if (fa->fsx_extsize == 0) - fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); - if (fa->fsx_cowextsize == 0) - fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; - - return 0; -} - -/** - * vfs_fileattr_set - change miscellaneous file attributes - * @idmap: idmap of the mount - * @dentry: the object to change - * @fa: fileattr pointer - * - * After verifying permissions, call i_op->fileattr_set() callback, if - * exists. - * - * Verifying attributes involves retrieving current attributes with - * i_op->fileattr_get(), this also allows initializing attributes that have - * not been set by the caller to current values. Inode lock is held - * thoughout to prevent racing with another instance. - * - * Return: 0 on success, or a negative error on failure. - */ -int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct fileattr old_ma = {}; - int err; - - if (!inode->i_op->fileattr_set) - return -ENOIOCTLCMD; - - if (!inode_owner_or_capable(idmap, inode)) - return -EPERM; - - inode_lock(inode); - err = vfs_fileattr_get(dentry, &old_ma); - if (!err) { - /* initialize missing bits from old_ma */ - if (fa->flags_valid) { - fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON; - fa->fsx_extsize = old_ma.fsx_extsize; - fa->fsx_nextents = old_ma.fsx_nextents; - fa->fsx_projid = old_ma.fsx_projid; - fa->fsx_cowextsize = old_ma.fsx_cowextsize; - } else { - fa->flags |= old_ma.flags & ~FS_COMMON_FL; - } - err = fileattr_set_prepare(inode, &old_ma, fa); - if (!err) - err = inode->i_op->fileattr_set(idmap, dentry, fa); - } - inode_unlock(inode); - - return err; -} -EXPORT_SYMBOL(vfs_fileattr_set); - -static int ioctl_getflags(struct file *file, unsigned int __user *argp) -{ - struct fileattr fa = { .flags_valid = true }; /* hint only */ - int err; - - err = vfs_fileattr_get(file->f_path.dentry, &fa); - if (!err) - err = put_user(fa.flags, argp); - return err; -} - -static int ioctl_setflags(struct file *file, unsigned int __user *argp) -{ - struct mnt_idmap *idmap = file_mnt_idmap(file); - struct dentry *dentry = file->f_path.dentry; - struct fileattr fa; - unsigned int flags; - int err; - - err = get_user(flags, argp); - if (!err) { - err = mnt_want_write_file(file); - if (!err) { - fileattr_fill_flags(&fa, flags); - err = vfs_fileattr_set(idmap, dentry, &fa); - mnt_drop_write_file(file); - } - } - return err; -} - -static int ioctl_fsgetxattr(struct file *file, void __user *argp) -{ - struct fileattr fa = { .fsx_valid = true }; /* hint only */ - int err; - - err = vfs_fileattr_get(file->f_path.dentry, &fa); - if (!err) - err = copy_fsxattr_to_user(&fa, argp); - - return err; -} - -static int ioctl_fssetxattr(struct file *file, void __user *argp) -{ - struct mnt_idmap *idmap = file_mnt_idmap(file); - struct dentry *dentry = file->f_path.dentry; - struct fileattr fa; - int err; - - err = copy_fsxattr_from_user(&fa, argp); - if (!err) { - err = mnt_want_write_file(file); - if (!err) { - err = vfs_fileattr_set(idmap, dentry, &fa); - mnt_drop_write_file(file); - } - } - return err; -} - static int ioctl_getfsuuid(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 69e8ebb41302..f7e1c8534c46 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -9,9 +9,9 @@ ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o iomap-y += trace.o \ - iter.o -iomap-$(CONFIG_BLOCK) += buffered-io.o \ - direct-io.o \ + iter.o \ + buffered-io.o +iomap-$(CONFIG_BLOCK) += direct-io.o \ ioend.o \ fiemap.o \ seek.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 233abf598f65..fd827398afd2 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -3,20 +3,11 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (C) 2016-2023 Christoph Hellwig. */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> #include <linux/iomap.h> -#include <linux/pagemap.h> -#include <linux/uio.h> #include <linux/buffer_head.h> -#include <linux/dax.h> #include <linux/writeback.h> #include <linux/swap.h> -#include <linux/bio.h> -#include <linux/sched/signal.h> #include <linux/migrate.h> -#include "internal.h" #include "trace.h" #include "../internal.h" @@ -71,6 +62,9 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, unsigned long flags; bool uptodate = true; + if (folio_test_uptodate(folio)) + return; + if (ifs) { spin_lock_irqsave(&ifs->state_lock, flags); uptodate = ifs_set_range_uptodate(folio, ifs, off, len); @@ -284,6 +278,46 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, *lenp = plen; } +static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, + loff_t pos) +{ + const struct iomap *srcmap = iomap_iter_srcmap(iter); + + return srcmap->type != IOMAP_MAPPED || + (srcmap->flags & IOMAP_F_NEW) || + pos >= i_size_read(iter->inode); +} + +/** + * iomap_read_inline_data - copy inline data into the page cache + * @iter: iteration structure + * @folio: folio to copy to + * + * Copy the inline data in @iter into @folio and zero out the rest of the folio. + * Only a single IOMAP_INLINE extent is allowed at the end of each file. + * Returns zero for success to complete the read, or the usual negative errno. + */ +static int iomap_read_inline_data(const struct iomap_iter *iter, + struct folio *folio) +{ + const struct iomap *iomap = iomap_iter_srcmap(iter); + size_t size = i_size_read(iter->inode) - iomap->offset; + size_t offset = offset_in_folio(folio, iomap->offset); + + if (folio_test_uptodate(folio)) + return 0; + + if (WARN_ON_ONCE(size > iomap->length)) + return -EIO; + if (offset > 0) + ifs_alloc(iter->inode, folio, iter->flags); + + folio_fill_tail(folio, offset, iomap->inline_data, size); + iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); + return 0; +} + +#ifdef CONFIG_BLOCK static void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { @@ -323,45 +357,6 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -/** - * iomap_read_inline_data - copy inline data into the page cache - * @iter: iteration structure - * @folio: folio to copy to - * - * Copy the inline data in @iter into @folio and zero out the rest of the folio. - * Only a single IOMAP_INLINE extent is allowed at the end of each file. - * Returns zero for success to complete the read, or the usual negative errno. - */ -static int iomap_read_inline_data(const struct iomap_iter *iter, - struct folio *folio) -{ - const struct iomap *iomap = iomap_iter_srcmap(iter); - size_t size = i_size_read(iter->inode) - iomap->offset; - size_t offset = offset_in_folio(folio, iomap->offset); - - if (folio_test_uptodate(folio)) - return 0; - - if (WARN_ON_ONCE(size > iomap->length)) - return -EIO; - if (offset > 0) - ifs_alloc(iter->inode, folio, iter->flags); - - folio_fill_tail(folio, offset, iomap->inline_data, size); - iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); - return 0; -} - -static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, - loff_t pos) -{ - const struct iomap *srcmap = iomap_iter_srcmap(iter); - - return srcmap->type != IOMAP_MAPPED || - (srcmap->flags & IOMAP_F_NEW) || - pos >= i_size_read(iter->inode); -} - static int iomap_readpage_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { @@ -554,6 +549,27 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readahead); +static int iomap_read_folio_range(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct bio_vec bvec; + struct bio bio; + + bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); + bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); + return submit_bio_wait(&bio); +} +#else +static int iomap_read_folio_range(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + WARN_ON_ONCE(1); + return -EIO; +} +#endif /* CONFIG_BLOCK */ + /* * iomap_is_partially_uptodate checks whether blocks within a folio are * uptodate or not. @@ -667,22 +683,10 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) pos + len - 1); } -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, - size_t poff, size_t plen, const struct iomap *iomap) -{ - struct bio_vec bvec; - struct bio bio; - - bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_add_folio_nofail(&bio, folio, plen, poff); - return submit_bio_wait(&bio); -} - -static int __iomap_write_begin(const struct iomap_iter *iter, size_t len, +static int __iomap_write_begin(const struct iomap_iter *iter, + const struct iomap_write_ops *write_ops, size_t len, struct folio *folio) { - const struct iomap *srcmap = iomap_iter_srcmap(iter); struct iomap_folio_state *ifs; loff_t pos = iter->pos; loff_t block_size = i_blocksize(iter->inode); @@ -731,8 +735,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len, if (iter->flags & IOMAP_NOWAIT) return -EAGAIN; - status = iomap_read_folio_sync(block_start, folio, - poff, plen, srcmap); + if (write_ops && write_ops->read_folio_range) + status = write_ops->read_folio_range(iter, + folio, block_start, plen); + else + status = iomap_read_folio_range(iter, + folio, block_start, plen); if (status) return status; } @@ -742,28 +750,27 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len, return 0; } -static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len) +static struct folio *__iomap_get_folio(struct iomap_iter *iter, + const struct iomap_write_ops *write_ops, size_t len) { - const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; loff_t pos = iter->pos; if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - if (folio_ops && folio_ops->get_folio) - return folio_ops->get_folio(iter, pos, len); - else - return iomap_get_folio(iter, pos, len); + if (write_ops && write_ops->get_folio) + return write_ops->get_folio(iter, pos, len); + return iomap_get_folio(iter, pos, len); } -static void __iomap_put_folio(struct iomap_iter *iter, size_t ret, +static void __iomap_put_folio(struct iomap_iter *iter, + const struct iomap_write_ops *write_ops, size_t ret, struct folio *folio) { - const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; loff_t pos = iter->pos; - if (folio_ops && folio_ops->put_folio) { - folio_ops->put_folio(iter->inode, pos, ret, folio); + if (write_ops && write_ops->put_folio) { + write_ops->put_folio(iter->inode, pos, ret, folio); } else { folio_unlock(folio); folio_put(folio); @@ -800,10 +807,10 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, * offset, and length. Callers can optionally pass a max length *plen, * otherwise init to zero. */ -static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, +static int iomap_write_begin(struct iomap_iter *iter, + const struct iomap_write_ops *write_ops, struct folio **foliop, size_t *poffset, u64 *plen) { - const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); @@ -818,7 +825,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, if (fatal_signal_pending(current)) return -EINTR; - folio = __iomap_get_folio(iter, len); + folio = __iomap_get_folio(iter, write_ops, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -832,8 +839,8 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, * could do the wrong thing here (zero a page range incorrectly or fail * to zero) and corrupt data. */ - if (folio_ops && folio_ops->iomap_valid) { - bool iomap_valid = folio_ops->iomap_valid(iter->inode, + if (write_ops && write_ops->iomap_valid) { + bool iomap_valid = write_ops->iomap_valid(iter->inode, &iter->iomap); if (!iomap_valid) { iter->iomap.flags |= IOMAP_F_STALE; @@ -849,7 +856,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(folio, pos, len, NULL, srcmap); else - status = __iomap_write_begin(iter, len, folio); + status = __iomap_write_begin(iter, write_ops, len, folio); if (unlikely(status)) goto out_unlock; @@ -859,8 +866,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, return 0; out_unlock: - __iomap_put_folio(iter, 0, folio); - + __iomap_put_folio(iter, write_ops, 0, folio); return status; } @@ -923,8 +929,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; - bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, folio, NULL); + bh_written = block_write_end(pos, len, copied, folio); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } @@ -932,7 +937,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i, + const struct iomap_write_ops *write_ops) { ssize_t total_written = 0; int status = 0; @@ -976,7 +982,8 @@ retry: break; } - status = iomap_write_begin(iter, &folio, &offset, &bytes); + status = iomap_write_begin(iter, write_ops, &folio, &offset, + &bytes); if (unlikely(status)) { iomap_write_failed(iter->inode, iter->pos, bytes); break; @@ -1005,7 +1012,7 @@ retry: i_size_write(iter->inode, pos + written); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - __iomap_put_folio(iter, written, folio); + __iomap_put_folio(iter, write_ops, written, folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); @@ -1038,7 +1045,8 @@ retry: ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, - const struct iomap_ops *ops, void *private) + const struct iomap_ops *ops, + const struct iomap_write_ops *write_ops, void *private) { struct iomap_iter iter = { .inode = iocb->ki_filp->f_mapping->host, @@ -1055,7 +1063,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i, write_ops); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1289,7 +1297,8 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static int iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter, + const struct iomap_write_ops *write_ops) { struct iomap *iomap = &iter->iomap; u64 bytes = iomap_length(iter); @@ -1304,14 +1313,15 @@ static int iomap_unshare_iter(struct iomap_iter *iter) bool ret; bytes = min_t(u64, SIZE_MAX, bytes); - status = iomap_write_begin(iter, &folio, &offset, &bytes); + status = iomap_write_begin(iter, write_ops, &folio, &offset, + &bytes); if (unlikely(status)) return status; if (iomap->flags & IOMAP_F_STALE) break; ret = iomap_write_end(iter, bytes, bytes, folio); - __iomap_put_folio(iter, bytes, folio); + __iomap_put_folio(iter, write_ops, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; @@ -1329,7 +1339,8 @@ static int iomap_unshare_iter(struct iomap_iter *iter) int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, - const struct iomap_ops *ops) + const struct iomap_ops *ops, + const struct iomap_write_ops *write_ops) { struct iomap_iter iter = { .inode = inode, @@ -1344,7 +1355,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter, write_ops); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1363,7 +1374,8 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, + const struct iomap_write_ops *write_ops) { u64 bytes = iomap_length(iter); int status; @@ -1374,7 +1386,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) bool ret; bytes = min_t(u64, SIZE_MAX, bytes); - status = iomap_write_begin(iter, &folio, &offset, &bytes); + status = iomap_write_begin(iter, write_ops, &folio, &offset, + &bytes); if (status) return status; if (iter->iomap.flags & IOMAP_F_STALE) @@ -1387,7 +1400,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) folio_mark_accessed(folio); ret = iomap_write_end(iter, bytes, bytes, folio); - __iomap_put_folio(iter, bytes, folio); + __iomap_put_folio(iter, write_ops, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; @@ -1403,7 +1416,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops, void *private) + const struct iomap_ops *ops, + const struct iomap_write_ops *write_ops, void *private) { struct iomap_iter iter = { .inode = inode, @@ -1433,7 +1447,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero, + write_ops); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1464,7 +1479,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, continue; } - iter.status = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero, write_ops); } return ret; } @@ -1472,7 +1487,8 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops, void *private) + const struct iomap_ops *ops, + const struct iomap_write_ops *write_ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1481,7 +1497,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, if (!off) return 0; return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, - private); + write_ops, private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); @@ -1535,278 +1551,54 @@ out_unlock: } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, +void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); - - if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) - folio_end_writeback(folio); -} - -/* - * We're now finished for good with this ioend structure. Update the page - * state, release holds on bios, and finally free up memory. Do not use the - * ioend after this. - */ -u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) -{ - struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_bio; - struct folio_iter fi; - u32 folio_count = 0; - - if (ioend->io_error) { - mapping_set_error(inode->i_mapping, ioend->io_error); - if (!bio_flagged(bio, BIO_QUIET)) { - pr_err_ratelimited( -"%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, - ioend->io_offset, ioend->io_sector); - } - } - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length); - folio_count++; - } - - bio_put(bio); /* frees the ioend */ - return folio_count; -} - -static void iomap_writepage_end_bio(struct bio *bio) -{ - struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); - - ioend->io_error = blk_status_to_errno(bio->bi_status); - iomap_finish_ioend_buffered(ioend); -} - -/* - * Submit an ioend. - * - * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we've marked pages for writeback. - * We cannot cancel ioend directly in that case, so call the bio end I/O handler - * with the error status here to run the normal I/O completion handler to clear - * the writeback bit and let the file system proess the errors. - */ -static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) -{ - if (!wpc->ioend) - return error; - - /* - * Let the file systems prepare the I/O submission and hook in an I/O - * comletion handler. This also needs to happen in case after a - * failure happened so that the file system end I/O handler gets called - * to clean up. - */ - if (wpc->ops->submit_ioend) { - error = wpc->ops->submit_ioend(wpc, error); - } else { - if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) - error = -EIO; - if (!error) - submit_bio(&wpc->ioend->io_bio); - } - - if (error) { - wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); - bio_endio(&wpc->ioend->io_bio); - } - - wpc->ioend = NULL; - return error; -} - -static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos, - u16 ioend_flags) -{ - struct bio *bio; - - bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, - REQ_OP_WRITE | wbc_to_write_flags(wbc), - GFP_NOFS, &iomap_ioend_bioset); - bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); - bio->bi_end_io = iomap_writepage_end_bio; - bio->bi_write_hint = inode->i_write_hint; - wbc_init_bio(wbc, bio); - wpc->nr_folios = 0; - return iomap_init_ioend(inode, bio, pos, ioend_flags); -} - -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, - u16 ioend_flags) -{ - if (ioend_flags & IOMAP_IOEND_BOUNDARY) - return false; - if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != - (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) - return false; - if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) - return false; - if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && - iomap_sector(&wpc->iomap, pos) != - bio_end_sector(&wpc->ioend->io_bio)) - return false; - /* - * Limit ioend bio chain lengths to minimise IO completion latency. This - * also prevents long tight loops ending page writeback on all the - * folios in the ioend. - */ - if (wpc->nr_folios >= IOEND_BATCH_SIZE) - return false; - return true; + if (ifs) + atomic_add(len, &ifs->write_bytes_pending); } +EXPORT_SYMBOL_GPL(iomap_start_folio_write); -/* - * Test to see if we have an existing ioend structure that we could append to - * first; otherwise finish off the current ioend and start another. - * - * If a new ioend is created and cached, the old ioend is submitted to the block - * layer instantly. Batching optimisations are provided by higher level block - * plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. - */ -static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct folio *folio, - struct inode *inode, loff_t pos, loff_t end_pos, - unsigned len) +void iomap_finish_folio_write(struct inode *inode, struct folio *folio, + size_t len) { struct iomap_folio_state *ifs = folio->private; - size_t poff = offset_in_folio(folio, pos); - unsigned int ioend_flags = 0; - int error; - - if (wpc->iomap.type == IOMAP_UNWRITTEN) - ioend_flags |= IOMAP_IOEND_UNWRITTEN; - if (wpc->iomap.flags & IOMAP_F_SHARED) - ioend_flags |= IOMAP_IOEND_SHARED; - if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) - ioend_flags |= IOMAP_IOEND_BOUNDARY; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { -new_ioend: - error = iomap_submit_ioend(wpc, 0); - if (error) - return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, - ioend_flags); - } - - if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) - goto new_ioend; - - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); - - /* - * Clamp io_offset and io_size to the incore EOF so that ondisk - * file size updates in the ioend completion are byte-accurate. - * This avoids recovering files with zeroed tail regions when - * writeback races with appending writes: - * - * Thread 1: Thread 2: - * ------------ ----------- - * write [A, A+B] - * update inode size to A+B - * submit I/O [A, A+BS] - * write [A+B, A+B+C] - * update inode size to A+B+C - * <I/O completes, updates disk size to min(A+B+C, A+BS)> - * <power failure> - * - * After reboot: - * 1) with A+B+C < A+BS, the file has zero padding in range - * [A+B, A+B+C] - * - * |< Block Size (BS) >| - * |DDDDDDDDDDDD0000000000000| - * ^ ^ ^ - * A A+B A+B+C - * (EOF) - * - * 2) with A+B+C > A+BS, the file has zero padding in range - * [A+B, A+BS] - * - * |< Block Size (BS) >|< Block Size (BS) >| - * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| - * ^ ^ ^ ^ - * A A+B A+BS A+B+C - * (EOF) - * - * D = Valid Data - * 0 = Zero Padding - * - * Note that this defeats the ability to chain the ioends of - * appending writes. - */ - wpc->ioend->io_size += len; - if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) - wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); - wbc_account_cgroup_owner(wbc, folio, len); - return 0; + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) + folio_end_writeback(folio); } +EXPORT_SYMBOL_GPL(iomap_finish_folio_write); -static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct folio *folio, - struct inode *inode, u64 pos, u64 end_pos, - unsigned dirty_len, unsigned *count) +static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 pos, u32 rlen, u64 end_pos, + bool *wb_pending) { - int error; - do { - unsigned map_len; - - error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); - if (error) - break; - trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap); + ssize_t ret; - map_len = min_t(u64, dirty_len, - wpc->iomap.offset + wpc->iomap.length - pos); - WARN_ON_ONCE(!folio->private && map_len < dirty_len); + ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos); + if (WARN_ON_ONCE(ret == 0 || ret > rlen)) + return -EIO; + if (ret < 0) + return ret; + rlen -= ret; + pos += ret; - switch (wpc->iomap.type) { - case IOMAP_INLINE: - WARN_ON_ONCE(1); - error = -EIO; - break; - case IOMAP_HOLE: - break; - default: - error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, - end_pos, map_len); - if (!error) - (*count)++; - break; - } - dirty_len -= map_len; - pos += map_len; - } while (dirty_len && !error); + /* + * Holes are not be written back by ->writeback_range, so track + * if we did handle anything that is not a hole here. + */ + if (wpc->iomap.type != IOMAP_HOLE) + *wb_pending = true; + } while (rlen); - /* - * We cannot cancel the ioend directly here on error. We may have - * already set other pages under writeback and hence we have to run I/O - * completion to mark the error state of the pages under writeback - * appropriately. - * - * Just let the file system know what portion of the folio failed to - * map. - */ - if (error && wpc->ops->discard_folio) - wpc->ops->discard_folio(folio, pos); - return error; + return 0; } /* @@ -1815,7 +1607,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, * If the folio is entirely beyond i_size, return false. If it straddles * i_size, adjust end_pos and zero all data beyond i_size. */ -static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, +static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode, u64 *end_pos) { u64 isize = i_size_read(inode); @@ -1867,15 +1659,14 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, return true; } -static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct folio *folio) +int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; - struct inode *inode = folio->mapping->host; + struct inode *inode = wpc->inode; u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - unsigned count = 0; + bool wb_pending = false; int error = 0; u32 rlen; @@ -1883,12 +1674,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, WARN_ON_ONCE(folio_test_dirty(folio)); WARN_ON_ONCE(folio_test_writeback(folio)); - trace_iomap_writepage(inode, pos, folio_size(folio)); + trace_iomap_writeback_folio(inode, pos, folio_size(folio)); - if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { - folio_unlock(folio); + if (!iomap_writeback_handle_eof(folio, inode, &end_pos)) return 0; - } WARN_ON_ONCE(end_pos <= pos); if (i_blocks_per_folio(inode, folio) > 1) { @@ -1904,7 +1693,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, * all blocks. */ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - atomic_inc(&ifs->write_bytes_pending); + iomap_start_folio_write(inode, folio, 1); } /* @@ -1918,14 +1707,14 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, */ end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { - error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, - pos, end_pos, rlen, &count); + error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, + &wb_pending); if (error) break; pos += rlen; } - if (count) + if (wb_pending) wpc->nr_folios++; /* @@ -1942,23 +1731,22 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, * already at this point. In that case we need to clear the writeback * bit ourselves right after unlocking the page. */ - folio_unlock(folio); if (ifs) { if (atomic_dec_and_test(&ifs->write_bytes_pending)) folio_end_writeback(folio); } else { - if (!count) + if (!wb_pending) folio_end_writeback(folio); } mapping_set_error(inode->i_mapping, error); return error; } +EXPORT_SYMBOL_GPL(iomap_writeback_folio); int -iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, - struct iomap_writepage_ctx *wpc, - const struct iomap_writeback_ops *ops) +iomap_writepages(struct iomap_writepage_ctx *wpc) { + struct address_space *mapping = wpc->inode->i_mapping; struct folio *folio = NULL; int error; @@ -1970,9 +1758,22 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, PF_MEMALLOC)) return -EIO; - wpc->ops = ops; - while ((folio = writeback_iter(mapping, wbc, folio, &error))) - error = iomap_writepage_map(wpc, wbc, folio); - return iomap_submit_ioend(wpc, error); + while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error))) { + error = iomap_writeback_folio(wpc, folio); + folio_unlock(folio); + } + + /* + * If @error is non-zero, it means that we have a situation where some + * part of the submission process has failed after we've marked pages + * for writeback. + * + * We cannot cancel the writeback directly in that case, so always call + * ->writeback_submit to run the I/O completion handler to clear the + * writeback bit and let the file system proess the errors. + */ + if (wpc->wb_ctx) + return wpc->ops->writeback_submit(wpc, error); + return error; } EXPORT_SYMBOL_GPL(iomap_writepages); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 844261a31156..6f25d4cfea9f 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -3,14 +3,9 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2025 Christoph Hellwig. */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/pagemap.h> #include <linux/iomap.h> -#include <linux/backing-dev.h> -#include <linux/uio.h> #include <linux/task_io_accounting_ops.h> #include "internal.h" #include "trace.h" diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 80675c42e94e..d11dadff8286 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -2,9 +2,6 @@ /* * Copyright (c) 2016-2021 Christoph Hellwig. */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/pagemap.h> diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index f6992a3bf66a..d05cb3aed96e 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -4,7 +4,6 @@ #define IOEND_BATCH_SIZE 4096 -u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 18894ebba6db..b49fa75eab26 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2024-2025 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include <linux/iomap.h> #include <linux/list_sort.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> #include "internal.h" +#include "trace.h" struct bio_set iomap_ioend_bioset; EXPORT_SYMBOL_GPL(iomap_ioend_bioset); @@ -28,6 +31,221 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode, } EXPORT_SYMBOL_GPL(iomap_init_ioend); +/* + * We're now finished for good with this ioend structure. Update the folio + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. + */ +static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) +{ + struct inode *inode = ioend->io_inode; + struct bio *bio = &ioend->io_bio; + struct folio_iter fi; + u32 folio_count = 0; + + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); + if (!bio_flagged(bio, BIO_QUIET)) { + pr_err_ratelimited( +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); + } + } + + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + iomap_finish_folio_write(inode, fi.folio, fi.length); + folio_count++; + } + + bio_put(bio); /* frees the ioend */ + return folio_count; +} + +static void ioend_writeback_end_bio(struct bio *bio) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); +} + +/* + * We cannot cancel the ioend directly in case of an error, so call the bio end + * I/O handler with the error status here to run the normal I/O completion + * handler. + */ +int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error) +{ + struct iomap_ioend *ioend = wpc->wb_ctx; + + if (!ioend->io_bio.bi_end_io) + ioend->io_bio.bi_end_io = ioend_writeback_end_bio; + + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + + if (error) { + ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&ioend->io_bio); + return error; + } + + submit_bio(&ioend->io_bio); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit); + +static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, + loff_t pos, u16 ioend_flags) +{ + struct bio *bio; + + bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, + REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc), + GFP_NOFS, &iomap_ioend_bioset); + bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_write_hint = wpc->inode->i_write_hint; + wbc_init_bio(wpc->wbc, bio); + wpc->nr_folios = 0; + return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags); +} + +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) +{ + struct iomap_ioend *ioend = wpc->wb_ctx; + + if (ioend_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (pos != ioend->io_offset + ioend->io_size) + return false; + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio)) + return false; + /* + * Limit ioend bio chain lengths to minimise IO completion latency. This + * also prevents long tight loops ending page writeback on all the + * folios in the ioend. + */ + if (wpc->nr_folios >= IOEND_BATCH_SIZE) + return false; + return true; +} + +/* + * Test to see if we have an existing ioend structure that we could append to + * first; otherwise finish off the current ioend and start another. + * + * If a new ioend is created and cached, the old ioend is submitted to the block + * layer instantly. Batching optimisations are provided by higher level block + * plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. + */ +ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, + loff_t pos, loff_t end_pos, unsigned int dirty_len) +{ + struct iomap_ioend *ioend = wpc->wb_ctx; + size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; + unsigned int map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + int error; + + trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap); + + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + return -EIO; + case IOMAP_HOLE: + return map_len; + default: + break; + } + + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (folio_test_dropbehind(folio)) + ioend_flags |= IOMAP_IOEND_DONTCACHE; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { +new_ioend: + if (ioend) { + error = wpc->ops->writeback_submit(wpc, 0); + if (error) + return error; + } + wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags); + } + + if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) + goto new_ioend; + + iomap_start_folio_write(wpc->inode, folio, map_len); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. + * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * <I/O completes, updates disk size to min(A+B+C, A+BS)> + * <power failure> + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. + */ + ioend->io_size += map_len; + if (ioend->io_offset + ioend->io_size > end_pos) + ioend->io_size = end_pos - ioend->io_offset; + + wbc_account_cgroup_owner(wpc->wbc, folio, map_len); + return map_len; +} +EXPORT_SYMBOL_GPL(iomap_add_to_ioend); + static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { if (ioend->io_parent) { diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 6ffc6a7b9ba5..cef77ca0c20b 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -3,7 +3,6 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2021 Christoph Hellwig. */ -#include <linux/fs.h> #include <linux/iomap.h> #include "trace.h" diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 04d7919636c1..56db2dd4b10d 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -3,12 +3,8 @@ * Copyright (C) 2017 Red Hat, Inc. * Copyright (c) 2018-2021 Christoph Hellwig. */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> #include <linux/iomap.h> #include <linux/pagemap.h> -#include <linux/pagevec.h> static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index c1a762c10ce4..0db77c449467 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -3,9 +3,6 @@ * Copyright (C) 2018 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> #include <linux/iomap.h> #include <linux/swap.h> diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c index 728d5443daf5..da217246b1a9 100644 --- a/fs/iomap/trace.c +++ b/fs/iomap/trace.c @@ -3,7 +3,6 @@ * Copyright (c) 2019 Christoph Hellwig */ #include <linux/iomap.h> -#include <linux/uio.h> /* * We include this last to have the helpers above available for the trace diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 455cc6f90be0..6ad66e6ba653 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -79,7 +79,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, DEFINE_EVENT(iomap_range_class, name, \ TP_PROTO(struct inode *inode, loff_t off, u64 len),\ TP_ARGS(inode, off, len)) -DEFINE_RANGE_EVENT(iomap_writepage); +DEFINE_RANGE_EVENT(iomap_writeback_folio); DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); @@ -169,7 +169,7 @@ DEFINE_EVENT(iomap_class, name, \ DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -TRACE_EVENT(iomap_writepage_map, +TRACE_EVENT(iomap_add_to_ioend, TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, struct iomap *iomap), TP_ARGS(inode, pos, dirty_len, iomap), diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d5da9817df9b..6f0e6b19383c 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -939,7 +939,7 @@ root_found: sbi->s_check = opt->check; if (table) - s->s_d_op = &isofs_dentry_ops[table - 1]; + set_default_d_op(s, &isofs_dentry_ops[table - 1]); /* get the root dentry */ s->s_root = d_make_root(inode); @@ -1440,9 +1440,16 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; - } else + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + ret = -EIO; + goto fail; + } ret = 0; out: diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6d5e76848733..d480b94117cd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -134,7 +134,7 @@ static __be32 jbd2_superblock_csum(journal_superblock_t *sb) static void commit_timeout(struct timer_list *t) { - journal_t *journal = from_timer(journal, t, j_commit_timer); + journal_t *journal = timer_container_of(journal, t, j_commit_timer); wake_up_process(journal->j_task); } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index ef3a1e1b6cb0..fda9f4d6093f 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -425,7 +425,9 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb .totlen = cpu_to_je32(c->cleanmarker_size) }; - jffs2_prealloc_raw_node_refs(c, jeb, 1); + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + if (ret) + goto filebad; marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4)); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 13c18ccc13b0..dd3dff95cb24 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -21,12 +21,14 @@ #include <linux/jffs2.h> #include "nodelist.h" -static int jffs2_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata); -static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata); +static int jffs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata); +static int jffs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata); static int jffs2_read_folio(struct file *filp, struct folio *folio); int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) @@ -54,7 +56,7 @@ const struct file_operations jffs2_file_operations = .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .unlocked_ioctl=jffs2_ioctl, - .mmap = generic_file_readonly_mmap, + .mmap_prepare = generic_file_readonly_mmap_prepare, .fsync = jffs2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, @@ -121,9 +123,10 @@ static int jffs2_read_folio(struct file *file, struct folio *folio) return ret; } -static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int jffs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct folio *folio; struct inode *inode = mapping->host; @@ -235,9 +238,10 @@ out_err: return ret; } -static int jffs2_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int jffs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 29671e33a171..62879c218d4b 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -256,7 +256,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n", __func__, skip); - jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); + if (ret) + goto out; jffs2_scan_dirty_space(c, c->nextblock, skip); } #endif diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 4fe64519870f..d83372d3e1a0 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -858,7 +858,10 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) spin_unlock(&c->erase_completion_lock); jeb = c->nextblock; - jffs2_prealloc_raw_node_refs(c, jeb, 1); + ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); + + if (ret) + goto out; if (!c->summary->sum_num || !c->summary->sum_list_head) { JFFS2_WARNING("Empty summary info!!!\n"); @@ -872,6 +875,8 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) datasize += padsize; ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); + +out: spin_lock(&c->erase_completion_lock); return ret; } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 01b6912e60f8..2a4a288b821c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -44,6 +44,9 @@ static int jfs_open(struct inode *inode, struct file *file) { int rc; + if (S_ISREG(inode->i_mode) && inode->i_size < 0) + return -EIO; + if ((rc = dquot_file_open(inode, file))) return rc; @@ -143,7 +146,7 @@ const struct file_operations jfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fsync = jfs_fsync, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 60fc92dee24d..fcedeb514e14 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -145,9 +145,9 @@ void jfs_evict_inode(struct inode *inode) if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); + truncate_inode_pages_final(&inode->i_data); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; - truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); @@ -290,9 +290,10 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) } } -static int jfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int jfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -303,13 +304,14 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int jfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int jfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) jfs_write_failed(mapping, pos + len); return ret; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index f7bd7e8f5be4..563f148be8af 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -57,7 +57,7 @@ static long jfs_map_ext2(unsigned long flags, int from) return mapped; } -int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry)); unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; @@ -71,7 +71,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int jfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct jfs_inode_info *jfs_inode = JFS_IP(inode); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 35e063c9f3a4..cdfa699cd7c8 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1389,6 +1389,12 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + if (ti < 0 || ti >= le32_to_cpu(dcp->nleafs)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + /* dmap control page trees fan-out by 4 and a single allocation * group may be described by 1 or 2 subtrees within the ag level * dmap control page, depending upon the ag size. examine the ag's @@ -1809,8 +1815,10 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) return -EIO; dp = (struct dmap *) mp->data; - if (dp->tree.budmin < 0) + if (dp->tree.budmin < 0) { + release_metapage(mp); return -EIO; + } /* try to allocate the blocks. */ diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index ea80661597ac..2c6c81c8cb9f 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -9,9 +9,9 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); extern int jfs_fsync(struct file *, loff_t, loff_t, int); -extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern int jfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index df575a873ec6..b98cf3bb6c1f 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -15,6 +15,7 @@ #include <linux/mempool.h> #include <linux/seq_file.h> #include <linux/writeback.h> +#include <linux/migrate.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" @@ -151,7 +152,59 @@ static inline void dec_io(struct folio *folio, blk_status_t status, handler(folio, anchor->status); } +#ifdef CONFIG_MIGRATION +static int __metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + struct meta_anchor *src_anchor = src->private; + struct metapage *mps[MPS_PER_PAGE] = {0}; + struct metapage *mp; + int i, rc; + + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = src_anchor->mp[i]; + if (mp && metapage_locked(mp)) + return -EAGAIN; + } + + rc = filemap_migrate_folio(mapping, dst, src, mode); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = src_anchor->mp[i]; + if (!mp) + continue; + if (unlikely(insert_metapage(dst, mp))) { + /* If error, roll-back previosly inserted pages */ + for (int j = 0 ; j < i; j++) { + if (mps[j]) + remove_metapage(dst, mps[j]); + } + return -EAGAIN; + } + mps[i] = mp; + } + + /* Update the metapage and remove it from src */ + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = mps[i]; + if (mp) { + int page_offset = mp->data - folio_address(src); + + mp->data = folio_address(dst) + page_offset; + mp->folio = dst; + remove_metapage(src, mp); + } + } + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_MIGRATION */ + #else + static inline struct metapage *folio_to_mp(struct folio *folio, int offset) { return folio->private; @@ -175,6 +228,35 @@ static inline void remove_metapage(struct folio *folio, struct metapage *mp) #define inc_io(folio) do {} while(0) #define dec_io(folio, status, handler) handler(folio, status) +#ifdef CONFIG_MIGRATION +static int __metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + struct metapage *mp; + int page_offset; + int rc; + + mp = folio_to_mp(src, 0); + if (metapage_locked(mp)) + return -EAGAIN; + + rc = filemap_migrate_folio(mapping, dst, src, mode); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (unlikely(insert_metapage(dst, mp))) + return -EAGAIN; + + page_offset = mp->data - folio_address(src); + mp->data = folio_address(dst) + page_offset; + mp->folio = dst; + remove_metapage(src, mp); + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_MIGRATION */ + #endif static inline struct metapage *alloc_metapage(gfp_t gfp_mask) @@ -339,7 +421,7 @@ static void metapage_write_end_io(struct bio *bio) } static int metapage_write_folio(struct folio *folio, - struct writeback_control *wbc, void *unused) + struct writeback_control *wbc) { struct bio *bio = NULL; int block_offset; /* block offset of mp within page */ @@ -468,10 +550,12 @@ static int metapage_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct blk_plug plug; + struct folio *folio = NULL; int err; blk_start_plug(&plug); - err = write_cache_pages(mapping, wbc, metapage_write_folio, NULL); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) + err = metapage_write_folio(folio, wbc); blk_finish_plug(&plug); return err; @@ -554,6 +638,29 @@ static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask) return ret; } +#ifdef CONFIG_MIGRATION +/* + * metapage_migrate_folio - Migration function for JFS metapages + */ +static int metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + int expected_count; + + if (!src->private) + return filemap_migrate_folio(mapping, dst, src, mode); + + /* Check whether page does not have extra refs before we do more work */ + expected_count = folio_expected_ref_count(src) + 1; + if (folio_ref_count(src) != expected_count) + return -EAGAIN; + return __metapage_migrate_folio(mapping, dst, src, mode); +} +#else +#define metapage_migrate_folio NULL +#endif /* CONFIG_MIGRATION */ + static void metapage_invalidate_folio(struct folio *folio, size_t offset, size_t length) { @@ -570,6 +677,7 @@ const struct address_space_operations jfs_metapage_aops = { .release_folio = metapage_release_folio, .invalidate_folio = metapage_invalidate_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = metapage_migrate_folio, }; struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, @@ -707,7 +815,7 @@ static int metapage_write_one(struct folio *folio) if (folio_clear_dirty_for_io(folio)) { folio_get(folio); - ret = metapage_write_folio(folio, &wbc, NULL); + ret = metapage_write_folio(folio, &wbc); if (ret == 0) folio_wait_writeback(folio); folio_put(folio); diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5ee618d17e77..28c3cf960c6f 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -49,26 +49,6 @@ #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) -/* get page buffer for specified block address */ -/* ToDo: Replace this ugly macro with a function */ -#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ -do { \ - BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \ - if (!(RC)) { \ - if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \ - (le16_to_cpu((P)->header.nextindex) > \ - le16_to_cpu((P)->header.maxentry)) || \ - (le16_to_cpu((P)->header.maxentry) > \ - (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \ - jfs_error((IP)->i_sb, \ - "XT_GETPAGE: xtree page corrupt\n"); \ - BT_PUTPAGE(MP); \ - MP = NULL; \ - RC = -EIO; \ - } \ - } \ -} while (0) - /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) @@ -115,6 +95,42 @@ static int xtSplitRoot(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp); /* + * xt_getpage() + * + * function: get the page buffer for a specified block address. + * + * parameters: + * ip - pointer to the inode + * bn - block number (s64) of the xtree page to be retrieved; + * mp - pointer to a metapage pointer where the page buffer is returned; + * + * returns: + * A pointer to the xtree page (xtpage_t) on success, -EIO on error. + */ + +static inline xtpage_t *xt_getpage(struct inode *ip, s64 bn, struct metapage **mp) +{ + xtpage_t *p; + int rc; + + BT_GETPAGE(ip, bn, *mp, xtpage_t, PSIZE, p, rc, i_xtroot); + + if (rc) + return ERR_PTR(rc); + if ((le16_to_cpu(p->header.nextindex) < XTENTRYSTART) || + (le16_to_cpu(p->header.nextindex) > + le16_to_cpu(p->header.maxentry)) || + (le16_to_cpu(p->header.maxentry) > + ((bn == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { + jfs_error(ip->i_sb, "xt_getpage: xtree page corrupt\n"); + BT_PUTPAGE(*mp); + *mp = NULL; + return ERR_PTR(-EIO); + } + return p; +} + +/* * xtLookup() * * function: map a single page into a physical extent; @@ -216,7 +232,6 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, int *cmpp, struct btstack * btstack, int flag) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); - int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; /* block number */ struct metapage *mp; /* page buffer */ @@ -252,9 +267,9 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, */ for (bn = 0;;) { /* get/pin the page to search */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* try sequential access heuristics with the previous * access entry in target leaf page: @@ -807,10 +822,10 @@ xtSplitUp(tid_t tid, * insert router entry in parent for new right child page <rp> */ /* get/pin the parent page <sp> */ - XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); - if (rc) { + sp = xt_getpage(ip, parent->bn, &smp); + if (IS_ERR(sp)) { XT_PUTPAGE(rcmp); - return rc; + return PTR_ERR(sp); } /* @@ -1062,10 +1077,10 @@ xtSplitPage(tid_t tid, struct inode *ip, * update previous pointer of old next/right page of <sp> */ if (nextbn != 0) { - XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); - if (rc) { + p = xt_getpage(ip, nextbn, &mp); + if (IS_ERR(p)) { XT_PUTPAGE(rmp); - goto clean_up; + return PTR_ERR(p); } BT_MARK_DIRTY(mp, ip); @@ -1417,9 +1432,9 @@ int xtExtend(tid_t tid, /* transaction id */ return rc; /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now @@ -1433,9 +1448,9 @@ int xtExtend(tid_t tid, /* transaction id */ XT_PUTPAGE(mp); /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1711,9 +1726,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) return rc; /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now @@ -1727,9 +1742,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) XT_PUTPAGE(mp); /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1788,9 +1803,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) XT_PUTPAGE(mp); /* get new right page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -1864,9 +1879,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); return rc; /* get back old page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* * if leaf root has been split, original root has been @@ -1881,9 +1896,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); XT_PUTPAGE(mp); /* get new child page */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { @@ -2187,7 +2202,6 @@ void xtInitRoot(tid_t tid, struct inode *ip) */ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) { - int rc = 0; s64 teof; struct metapage *mp; xtpage_t *p; @@ -2268,9 +2282,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * first access of each page: */ getPage: - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* process entries backward from last index */ index = le16_to_cpu(p->header.nextindex) - 1; @@ -2506,9 +2520,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) /* get back the parent page */ bn = parent->bn; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); index = parent->index; @@ -2791,9 +2805,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) * first access of each page: */ getPage: - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); /* process entries backward from last index */ index = le16_to_cpu(p->header.nextindex) - 1; @@ -2836,9 +2850,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) /* get back the parent page */ bn = parent->bn; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; + p = xt_getpage(ip, bn, &mp); + if (IS_ERR(p)) + return PTR_ERR(p); index = parent->index; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 10368c188c5e..3cfb86c5a36e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -542,7 +542,7 @@ static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_magic = JFS_SUPER_MAGIC; if (sbi->mntflag & JFS_OS2) - sb->s_d_op = &jfs_ci_dentry_operations; + set_default_d_op(sb, &jfs_ci_dentry_operations); inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b83054da68b3..3c293a5a21b1 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -24,45 +24,46 @@ static const struct inode_operations kernfs_iops = { .listxattr = kernfs_iop_listxattr, }; -static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) +static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc) { - static DEFINE_MUTEX(iattr_mutex); - struct kernfs_iattrs *ret; + struct kernfs_iattrs *ret __free(kfree) = NULL; + struct kernfs_iattrs *attr; - mutex_lock(&iattr_mutex); + attr = READ_ONCE(kn->iattr); + if (attr || !alloc) + return attr; - if (kn->iattr || !alloc) - goto out_unlock; - - kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); - if (!kn->iattr) - goto out_unlock; + ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); + if (!ret) + return NULL; /* assign default attributes */ - kn->iattr->ia_uid = GLOBAL_ROOT_UID; - kn->iattr->ia_gid = GLOBAL_ROOT_GID; - - ktime_get_real_ts64(&kn->iattr->ia_atime); - kn->iattr->ia_mtime = kn->iattr->ia_atime; - kn->iattr->ia_ctime = kn->iattr->ia_atime; - - simple_xattrs_init(&kn->iattr->xattrs); - atomic_set(&kn->iattr->nr_user_xattrs, 0); - atomic_set(&kn->iattr->user_xattr_size, 0); -out_unlock: - ret = kn->iattr; - mutex_unlock(&iattr_mutex); - return ret; + ret->ia_uid = GLOBAL_ROOT_UID; + ret->ia_gid = GLOBAL_ROOT_GID; + + ktime_get_real_ts64(&ret->ia_atime); + ret->ia_mtime = ret->ia_atime; + ret->ia_ctime = ret->ia_atime; + + simple_xattrs_init(&ret->xattrs); + atomic_set(&ret->nr_user_xattrs, 0); + atomic_set(&ret->user_xattr_size, 0); + + /* If someone raced us, recognize it. */ + if (!try_cmpxchg(&kn->iattr, &attr, ret)) + return READ_ONCE(kn->iattr); + + return no_free_ptr(ret); } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { - return __kernfs_iattrs(kn, 1); + return __kernfs_iattrs(kn, true); } static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) { - return __kernfs_iattrs(kn, 0); + return __kernfs_iattrs(kn, false); } int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) @@ -141,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; - attrs = kernfs_iattrs(kn); + attrs = kernfs_iattrs_noalloc(kn); if (!attrs) - return -ENOMEM; + return -ENODATA; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } @@ -166,9 +167,10 @@ static inline void set_inode_attr(struct inode *inode, static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { - struct kernfs_iattrs *attrs = kn->iattr; + struct kernfs_iattrs *attrs; inode->i_mode = kn->mode; + attrs = kernfs_iattrs_noalloc(kn); if (attrs) /* * kernfs_node has non-default attributes get them from @@ -306,7 +308,9 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; - struct kernfs_iattrs *attrs = kernfs_iattrs(kn); + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; @@ -345,8 +349,9 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { - atomic_t *sz = &kn->iattr->user_xattr_size; - atomic_t *nr = &kn->iattr->nr_user_xattrs; + struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); + atomic_t *sz = &attr->user_xattr_size; + atomic_t *nr = &attr->nr_user_xattrs; struct simple_xattr *old_xattr; int ret; @@ -384,8 +389,9 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { - atomic_t *sz = &kn->iattr->user_xattr_size; - atomic_t *nr = &kn->iattr->nr_user_xattrs; + struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); + atomic_t *sz = &attr->user_xattr_size; + atomic_t *nr = &attr->nr_user_xattrs; struct simple_xattr *old_xattr; old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index c1719b5778a1..e384a69fbece 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -318,7 +318,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k return -ENOMEM; } sb->s_root = root; - sb->s_d_op = &kernfs_dops; + set_default_d_op(sb, &kernfs_dops); return 0; } diff --git a/fs/libfs.c b/fs/libfs.c index 9ea0ecc325a8..ce8c496a6940 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -62,11 +62,6 @@ int always_delete_dentry(const struct dentry *dentry) } EXPORT_SYMBOL(always_delete_dentry); -const struct dentry_operations simple_dentry_operations = { - .d_delete = always_delete_dentry, -}; -EXPORT_SYMBOL(simple_dentry_operations); - /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. @@ -75,9 +70,11 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - if (!dentry->d_sb->s_d_op) - d_set_d_op(dentry, &simple_dentry_operations); - + if (!dentry->d_op && !(dentry->d_flags & DCACHE_DONTCACHE)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_DONTCACHE; + spin_unlock(&dentry->d_lock); + } if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) return NULL; @@ -605,15 +602,16 @@ struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) } EXPORT_SYMBOL(find_next_child); -void simple_recursive_removal(struct dentry *dentry, - void (*callback)(struct dentry *)) +static void __simple_recursive_removal(struct dentry *dentry, + void (*callback)(struct dentry *), + bool locked) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; - inode_lock(inode); + inode_lock_nested(inode, I_MUTEX_CHILD); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { @@ -625,15 +623,13 @@ void simple_recursive_removal(struct dentry *dentry, victim = this; this = this->d_parent; inode = this->d_inode; - inode_lock(inode); + if (!locked || victim != dentry) + inode_lock_nested(inode, I_MUTEX_CHILD); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts - if (d_is_dir(victim)) - fsnotify_rmdir(inode, victim); - else - fsnotify_unlink(inode, victim); if (callback) callback(victim); + fsnotify_delete(inode, d_inode(victim), victim); dput(victim); // unpin it } if (victim == dentry) { @@ -641,7 +637,8 @@ void simple_recursive_removal(struct dentry *dentry, inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); - inode_unlock(inode); + if (!locked) + inode_unlock(inode); dput(dentry); return; } @@ -650,8 +647,22 @@ void simple_recursive_removal(struct dentry *dentry, this = child; } } + +void simple_recursive_removal(struct dentry *dentry, + void (*callback)(struct dentry *)) +{ + return __simple_recursive_removal(dentry, callback, false); +} EXPORT_SYMBOL(simple_recursive_removal); +/* caller holds parent directory with I_MUTEX_PARENT */ +void locked_recursive_removal(struct dentry *dentry, + void (*callback)(struct dentry *)) +{ + return __simple_recursive_removal(dentry, callback, true); +} +EXPORT_SYMBOL(locked_recursive_removal); + static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; @@ -684,7 +695,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; - s->s_d_op = ctx->dops; + set_default_d_op(s, ctx->dops); return 0; } @@ -910,7 +921,7 @@ static int simple_read_folio(struct file *file, struct folio *folio) return 0; } -int simple_write_begin(struct file *file, struct address_space *mapping, +int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -935,7 +946,7 @@ EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes - * @file: See .write_end of address_space_operations + * @iocb: kernel I/O control block * @mapping: " * @pos: " * @len: " @@ -946,7 +957,8 @@ EXPORT_SYMBOL(simple_write_begin); * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for - * FSes that don't need any other processing. i_mutex is assumed to be held. + * FSes that don't need any other processing. i_rwsem is assumed to be held + * exclusively. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode @@ -955,9 +967,10 @@ EXPORT_SYMBOL(simple_write_begin); * * Use *ONLY* with simple_read_folio() */ -static int simple_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int simple_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -973,7 +986,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping, } /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. + * cannot change under us because we hold the i_rwsem. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); @@ -1583,13 +1596,17 @@ EXPORT_SYMBOL(generic_file_fsync); int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; - u64 last_fs_page = - last_fs_block >> (PAGE_SHIFT - blocksize_bits); + u64 last_fs_page, max_bytes; + + if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes)) + return -EFBIG; + + last_fs_page = (max_bytes >> PAGE_SHIFT) - 1; if (unlikely(num_blocks == 0)) return 0; - if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) + if (blocksize_bits < 9) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || @@ -1649,12 +1666,10 @@ struct inode *alloc_anon_inode(struct super_block *s) */ inode->i_state = I_DIRTY; /* - * Historically anonymous inodes didn't have a type at all and - * userspace has come to rely on this. Internally they're just - * regular files but S_IFREG is masked off when reporting - * information to userspace. + * Historically anonymous inodes don't have a type at all and + * userspace has come to rely on this. */ - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE | S_ANON_INODE; @@ -1950,22 +1965,22 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { * @sb: superblock to be configured * * Filesystems supporting casefolding and/or fscrypt can call this - * helper at mount-time to configure sb->s_d_op to best set of dentry - * operations required for the enabled features. The helper must be - * called after these have been configured, but before the root dentry - * is created. + * helper at mount-time to configure default dentry_operations to the + * best set of dentry operations required for the enabled features. + * The helper must be called after these have been configured, but + * before the root dentry is created. */ void generic_set_sb_d_ops(struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) { - sb->s_d_op = &generic_ci_dentry_ops; + set_default_d_op(sb, &generic_ci_dentry_ops); return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (sb->s_cop) { - sb->s_d_op = &generic_encrypted_dentry_ops; + set_default_d_op(sb, &generic_encrypted_dentry_ops); return; } #endif @@ -2128,6 +2143,8 @@ struct dentry *stashed_dentry_get(struct dentry **stashed) dentry = rcu_dereference(*stashed); if (!dentry) return NULL; + if (IS_ERR(dentry)) + return dentry; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; @@ -2160,7 +2177,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed, /* Notice when this is changed. */ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); - WARN_ON_ONCE(!IS_IMMUTABLE(inode)); dentry = d_alloc_anon(sb); if (!dentry) { @@ -2176,8 +2192,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed, return dentry; } -static struct dentry *stash_dentry(struct dentry **stashed, - struct dentry *dentry) +struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { @@ -2218,14 +2233,16 @@ static struct dentry *stash_dentry(struct dentry **stashed, int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { - struct dentry *dentry; + struct dentry *dentry, *res; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = stashed_dentry_get(stashed); - if (path->dentry) { + res = stashed_dentry_get(stashed); + if (IS_ERR(res)) + return PTR_ERR(res); + if (res) { sops->put_data(data); - goto out_path; + goto make_path; } /* Allocate a new dentry. */ @@ -2234,14 +2251,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. */ - path->dentry = stash_dentry(stashed, dentry); - if (path->dentry != dentry) + if (sops->stash_dentry) + res = sops->stash_dentry(stashed, dentry); + else + res = stash_dentry(stashed, dentry); + if (IS_ERR(res)) { + dput(dentry); + return PTR_ERR(res); + } + if (res != dentry) dput(dentry); -out_path: - WARN_ON_ONCE(path->dentry->d_fsdata != stashed); - WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); +make_path: + path->dentry = res; path->mnt = mntget(mnt); + VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed); + VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); return 0; } @@ -2263,3 +2288,28 @@ void stashed_dentry_prune(struct dentry *dentry) */ cmpxchg(stashed, dentry, NULL); } + +/* parent must be held exclusive */ +struct dentry *simple_start_creating(struct dentry *parent, const char *name) +{ + struct dentry *dentry; + struct inode *dir = d_inode(parent); + + inode_lock(dir); + if (unlikely(IS_DEADDIR(dir))) { + inode_unlock(dir); + return ERR_PTR(-ENOENT); + } + dentry = lookup_noperm(&QSTR(name), parent); + if (IS_ERR(dentry)) { + inode_unlock(dir); + return dentry; + } + if (dentry->d_inode) { + dput(dentry); + inode_unlock(dir); + return ERR_PTR(-EEXIST); + } + return dentry; +} +EXPORT_SYMBOL(simple_start_creating); diff --git a/fs/locks.c b/fs/locks.c index 1619cddfa7a4..559f02aa4172 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -712,7 +712,7 @@ static void __locks_wake_up_blocks(struct file_lock_core *blocker) fl->fl_lmops && fl->fl_lmops->lm_notify) fl->fl_lmops->lm_notify(fl); else - locks_wake_up(fl); + locks_wake_up_waiter(waiter); /* * The setting of flc_blocker to NULL marks the "done" @@ -1794,7 +1794,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr /* * In the delegation case we need mutual exclusion with - * a number of operations that take the i_mutex. We trylock + * a number of operations that take the i_rwsem. We trylock * because delegations are an optional optimization, and if * there's some chance of a conflict--we'd rather not * bother, maybe that's a sign this just isn't a good file to diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dd2a425b41f0..19052fc47e9e 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/minix/file.c b/fs/minix/file.c index 906d192ab7f3..dca7ac71f049 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -17,7 +17,7 @@ const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, }; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f007e389d5d2..df9d11479caf 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -442,9 +442,10 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) } } -static int minix_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int minix_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/mount.h b/fs/mount.h index ad7173037924..97737051a8b9 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -44,7 +44,6 @@ struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; struct hlist_head m_list; - int m_count; }; struct mount { @@ -70,8 +69,8 @@ struct mount { struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ - struct list_head mnt_slave_list;/* list of slave mounts */ - struct list_head mnt_slave; /* slave list entry */ + struct hlist_head mnt_slave_list;/* list of slave mounts */ + struct hlist_node mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ @@ -79,21 +78,38 @@ struct mount { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; - struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; struct list_head to_notify; /* need to queue notification */ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif + int mnt_t_flags; /* namespace_sem-protected flags */ int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; + struct mount *overmount; /* mounted on ->mnt_root */ } __randomize_layout; +enum { + T_SHARED = 1, /* mount is shared */ + T_UNBINDABLE = 2, /* mount is unbindable */ + T_MARKED = 4, /* internal mark for propagate_... */ + T_UMOUNT_CANDIDATE = 8, /* for propagate_umount */ + + /* + * T_SHARED_MASK is the set of flags that should be cleared when a + * mount becomes shared. Currently, this is only the flag that says a + * mount cannot be bind mounted, since this is how we create a mount + * that shares events with another mount. If you add a new T_* + * flag, consider how it interacts with shared mounts. + */ + T_SHARED_MASK = T_UNBINDABLE, +}; + #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ static inline struct mount *real_mount(struct vfsmount *mnt) @@ -101,7 +117,7 @@ static inline struct mount *real_mount(struct vfsmount *mnt) return container_of(mnt, struct mount, mnt); } -static inline int mnt_has_parent(struct mount *mnt) +static inline int mnt_has_parent(const struct mount *mnt) { return mnt != mnt->mnt_parent; } @@ -146,8 +162,8 @@ struct proc_mounts { extern const struct seq_operations mounts_op; -extern bool __is_local_mountpoint(struct dentry *dentry); -static inline bool is_local_mountpoint(struct dentry *dentry) +extern bool __is_local_mountpoint(const struct dentry *dentry); +static inline bool is_local_mountpoint(const struct dentry *dentry) { if (!d_mountpoint(dentry)) return false; @@ -160,6 +176,13 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool anon_ns_root(const struct mount *m) +{ + struct mnt_namespace *ns = READ_ONCE(m->mnt_ns); + + return !IS_ERR_OR_NULL(ns) && is_anon_ns(ns) && m == ns->root; +} + static inline bool mnt_ns_attached(const struct mount *mnt) { return !RB_EMPTY_NODE(&mnt->mnt_node); @@ -170,7 +193,7 @@ static inline bool mnt_ns_empty(const struct mnt_namespace *ns) return RB_EMPTY_ROOT(&ns->mounts); } -static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) +static inline void move_from_ns(struct mount *mnt) { struct mnt_namespace *ns = mnt->mnt_ns; WARN_ON(!mnt_ns_attached(mnt)); @@ -180,7 +203,6 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); - list_add_tail(&mnt->mnt_list, dt_list); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); diff --git a/fs/namei.c b/fs/namei.c index 4bb889fc980b..cd43ff89fbaa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; @@ -1469,7 +1469,7 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { - /* Allow the filesystem to manage the transit without i_mutex + /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); @@ -1665,9 +1665,17 @@ static struct dentry *lookup_dcache(const struct qstr *name, return dentry; } -static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, - struct dentry *base, - unsigned int flags) +/* + * Parent directory has inode locked exclusive. This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. + */ +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) { struct dentry *dentry; struct dentry *old; @@ -1675,7 +1683,7 @@ static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, dentry = lookup_dcache(name, base, flags); if (dentry) - return dentry; + goto found; /* Don't create child dentry for a dead directory. */ dir = base->d_inode; @@ -1691,24 +1699,7 @@ static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, dput(dentry); dentry = old; } - return dentry; -} - -/* - * Parent directory has inode locked exclusive. This is one - * and only case when ->lookup() gets called on non in-lookup - * dentries - as the matter of fact, this only gets called - * when directory is guaranteed to have no in-lookup children - * at all. - * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. - * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. - */ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, unsigned int flags) -{ - struct dentry *dentry; - - dentry = lookup_one_qstr_excl_raw(name, base, flags); +found: if (IS_ERR(dentry)) return dentry; if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { @@ -2580,11 +2571,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags) unsigned seq; do { - seq = read_seqcount_begin(&fs->seq); + seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; @@ -2790,7 +2781,7 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path) if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0); + d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE); if (IS_ERR(d)) { inode_unlock(parent_path.dentry->d_inode); return d; @@ -2917,7 +2908,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, * @base: base directory to lookup from * * Look up a dentry by name in the dcache, returning NULL if it does not - * currently exist. The function does not try to create a dentry. + * currently exist. The function does not try to create a dentry and if one + * is found it doesn't try to revalidate it. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. @@ -2933,7 +2925,7 @@ struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) if (err) return ERR_PTR(err); - return lookup_dcache(name, base, 0); + return d_lookup(base, name); } EXPORT_SYMBOL(try_lookup_noperm); @@ -2945,7 +2937,7 @@ EXPORT_SYMBOL(try_lookup_noperm); * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { @@ -2971,7 +2963,7 @@ EXPORT_SYMBOL(lookup_noperm); * * This can be used for in-kernel filesystem clients such as file servers. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) @@ -3057,14 +3049,22 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * - * Unlike lookup_noperm, it should be called without the parent + * Unlike lookup_noperm(), it should be called without the parent * i_rwsem held, and will take the i_rwsem itself if necessary. + * + * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already + * existed. */ struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { struct dentry *ret; + int err; - ret = try_lookup_noperm(name, base); + err = lookup_noperm_common(name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); if (!ret) ret = lookup_slow(name, base, 0); return ret; @@ -3471,7 +3471,7 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path, return -EACCES; break; default: - VFS_BUG_ON_INODE(1, inode); + VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); @@ -4542,13 +4542,13 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * - * The caller must hold dir->i_mutex. + * The caller must hold dir->i_rwsem exclusively. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop - * dir->i_mutex before doing so. + * dir->i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -4607,7 +4607,7 @@ EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_mutex. Truncate can take a long time if there is a lot of + * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ @@ -4785,13 +4785,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * - * The caller must hold dir->i_mutex + * The caller must hold dir->i_rwsem exclusively. * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -4987,7 +4987,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). - * And that - after we got ->i_mutex on parents (until then we don't know + * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we @@ -4999,15 +4999,16 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_mutex + * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_mutex on parents, which works but leads to some truly excessive + * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; - struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; + struct inode *old_dir = d_inode(rd->old_parent); + struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; @@ -5266,10 +5267,10 @@ retry_deleg: if (error) goto exit5; - rd.old_dir = old_path.dentry->d_inode; + rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); - rd.new_dir = new_path.dentry->d_inode; + rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; diff --git a/fs/namespace.c b/fs/namespace.c index 2f2e93927f46..ddfd4457d338 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -79,6 +79,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY @@ -380,10 +381,9 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_HEAD(&mnt->mnt_slave_list); + INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); - INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; @@ -894,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path) * namespace not just a mount that happens to have some specified * parent mount. */ -bool __is_local_mountpoint(struct dentry *dentry) +bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; @@ -911,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry) return is_covered; } -static struct mountpoint *lookup_mountpoint(struct dentry *dentry) +struct pinned_mountpoint { + struct hlist_node node; + struct mountpoint *mp; +}; + +static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { - mp->m_count++; - return mp; + hlist_add_head(&m->node, &mp->m_list); + m->mp = mp; + return true; } } - return NULL; + return false; } -static struct mountpoint *get_mountpoint(struct dentry *dentry) +static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { - struct mountpoint *mp, *new = NULL; + struct mountpoint *mp __free(kfree) = NULL; + bool found; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) - return ERR_PTR(-ENOENT); + return -ENOENT; mountpoint: read_seqlock_excl(&mount_lock); - mp = lookup_mountpoint(dentry); + found = lookup_mountpoint(dentry, m); read_sequnlock_excl(&mount_lock); - if (mp) - goto done; + if (found) + return 0; } - if (!new) - new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!new) - return ERR_PTR(-ENOMEM); - + if (!mp) + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return -ENOMEM; /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); @@ -956,34 +962,28 @@ mountpoint: goto mountpoint; /* The dentry is not available as a mountpoint? */ - mp = ERR_PTR(ret); if (ret) - goto done; + return ret; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); - new->m_dentry = dget(dentry); - new->m_count = 1; - hlist_add_head(&new->m_hash, mp_hash(dentry)); - INIT_HLIST_HEAD(&new->m_list); + mp->m_dentry = dget(dentry); + hlist_add_head(&mp->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&mp->m_list); + hlist_add_head(&m->node, &mp->m_list); + m->mp = no_free_ptr(mp); read_sequnlock_excl(&mount_lock); - - mp = new; - new = NULL; -done: - kfree(new); - return mp; + return 0; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ -static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) +static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list) { - if (!--mp->m_count) { + if (hlist_empty(&mp->m_list)) { struct dentry *dentry = mp->m_dentry; - BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -993,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) } } -/* called with namespace_lock and vfsmount lock */ -static void put_mountpoint(struct mountpoint *mp) +/* + * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] + */ +static void unpin_mountpoint(struct pinned_mountpoint *m) { - __put_mountpoint(mp, &ex_mountpoints); + if (m->mp) { + hlist_del(&m->node); + maybe_free_mountpoint(m->mp, &ex_mountpoints); + } } static inline int check_mnt(struct mount *mnt) @@ -1038,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock] */ -static struct mountpoint *unhash_mnt(struct mount *mnt) +static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list) { struct mountpoint *mp; + struct mount *parent = mnt->mnt_parent; + if (unlikely(parent->overmount == mnt)) + parent->overmount = NULL; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); @@ -1050,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt) hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; - return mp; + maybe_free_mountpoint(mp, shrink_list); } /* - * vfsmount lock must be held for write + * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints) */ static void umount_mnt(struct mount *mnt) { - put_mountpoint(unhash_mnt(mnt)); + __umount_mnt(mnt, &ex_mountpoints); } /* @@ -1068,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { - mp->m_count++; - mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } -/** - * mnt_set_mountpoint_beneath - mount a mount beneath another one - * - * @new_parent: the source mount - * @top_mnt: the mount beneath which @new_parent is mounted - * @new_mp: the new mountpoint of @top_mnt on @new_parent - * - * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and - * parent @top_mnt->mnt_parent and mount it on top of @new_parent at - * @new_mp. And mount @new_parent on the old parent and old - * mountpoint of @top_mnt. - * - * Context: This function expects namespace_lock() and lock_mount_hash() - * to have been acquired in that order. - */ -static void mnt_set_mountpoint_beneath(struct mount *new_parent, - struct mount *top_mnt, - struct mountpoint *new_mp) -{ - struct mount *old_top_parent = top_mnt->mnt_parent; - struct mountpoint *old_top_mp = top_mnt->mnt_mp; - - mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); - mnt_change_mountpoint(new_parent, new_mp, top_mnt); -} - - -static void __attach_mnt(struct mount *mnt, struct mount *parent) +static void make_visible(struct mount *mnt) { + struct mount *parent = mnt->mnt_parent; + if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) + parent->overmount = mnt; hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); @@ -1116,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint - * @beneath: whether to mount @mnt beneath or on top of @parent * - * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * Mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * - * If @beneath is true, remove @mnt from its current parent and - * mountpoint and mount it on @mp on @parent, and mount @parent on the - * old parent and old mountpoint of @mnt. Finally, attach @parent to - * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. - * - * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * Note, when make_visible() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, - struct mountpoint *mp, bool beneath) + struct mountpoint *mp) { - if (beneath) - mnt_set_mountpoint_beneath(mnt, parent, mp); - else - mnt_set_mountpoint(parent, mp, mnt); - /* - * Note, @mnt->mnt_parent has to be used. If @mnt was mounted - * beneath @parent then @mnt will need to be attached to - * @parent's old parent, not @parent. IOW, @mnt->mnt_parent - * isn't the same mount as @parent. - */ - __attach_mnt(mnt, mnt->mnt_parent); + mnt_set_mountpoint(parent, mp, mnt); + make_visible(mnt); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; - struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp, false); + attach_mnt(mnt, parent, mp); - put_mountpoint(old_mp); - mnt_add_count(old_parent, -1); + maybe_free_mountpoint(old_mp, &ex_mountpoints); } static inline struct mount *node_to_mount(struct rb_node *node) @@ -1197,32 +1162,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) mnt_notify_add(mnt); } -/* - * vfsmount lock must be held for write - */ -static void commit_tree(struct mount *mnt) -{ - struct mount *parent = mnt->mnt_parent; - struct mount *m; - LIST_HEAD(head); - struct mnt_namespace *n = parent->mnt_ns; - - BUG_ON(parent == mnt); - - list_add_tail(&head, &mnt->mnt_list); - while (!list_empty(&head)) { - m = list_first_entry(&head, typeof(*m), mnt_list); - list_del(&m->mnt_list); - - mnt_add_to_ns(n, m); - } - n->nr_mounts += n->pending_mounts; - n->pending_mounts = 0; - - __attach_mnt(mnt, parent); - touch_mnt_namespace(n); -} - static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; @@ -1249,6 +1188,27 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct mount *mnt) +{ + struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; + + if (!mnt_ns_attached(mnt)) { + for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) + if (unlikely(mnt_ns_attached(m))) + m = skip_mnt_tree(m); + else + mnt_add_to_ns(n, m); + n->nr_mounts += n->pending_mounts; + n->pending_mounts = 0; + } + + make_visible(mnt); + touch_mnt_namespace(n); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1296,6 +1256,15 @@ struct vfsmount *fc_mount(struct fs_context *fc) } EXPORT_SYMBOL(fc_mount); +struct vfsmount *fc_mount_longterm(struct fs_context *fc) +{ + struct vfsmount *mnt = fc_mount(fc); + if (!IS_ERR(mnt)) + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + return mnt; +} +EXPORT_SYMBOL(fc_mount_longterm); + struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) @@ -1337,7 +1306,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & + ~MNT_INTERNAL_FLAGS; + + if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -1348,8 +1320,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + if (mnt->mnt_group_id) + set_mnt_shared(mnt); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1362,30 +1334,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); - if ((flag & CL_SLAVE) || - ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { - list_add(&mnt->mnt_slave, &old->mnt_slave_list); + if (flag & CL_PRIVATE) // we are done with it + return mnt; + + if (peers(mnt, old)) + list_add(&mnt->mnt_share, &old->mnt_share); + + if ((flag & CL_SLAVE) && old->mnt_group_id) { + hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); + } else if (IS_MNT_SLAVE(old)) { + hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; - } else { - CLEAR_MNT_SHARED(mnt); } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } - return mnt; out_free: @@ -1478,11 +1439,13 @@ static void mntput_no_expire(struct mount *mnt) rcu_read_unlock(); list_del(&mnt->mnt_instance); + if (unlikely(!list_empty(&mnt->mnt_expire))) + list_del(&mnt->mnt_expire); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { - __put_mountpoint(unhash_mnt(p), &list); + __umount_mnt(p, &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } @@ -1679,23 +1642,19 @@ const struct seq_operations mounts_op = { int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); - int actual_refs = 0; - int minimum_refs = 0; - struct mount *p; - BUG_ON(!m); + bool busy = false; /* write lock needed for mnt_get_count */ lock_mount_hash(); - for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += mnt_get_count(p); - minimum_refs += 2; + for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { + if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { + busy = true; + break; + } } unlock_mount_hash(); - if (actual_refs > minimum_refs) - return 0; - - return 1; + return !busy; } EXPORT_SYMBOL(may_umount_tree); @@ -1771,15 +1730,18 @@ static bool need_notify_mnt_list(void) } #endif +static void free_mnt_ns(struct mnt_namespace *); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; + struct mnt_namespace *ns = emptied_ns; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); + emptied_ns = NULL; if (need_notify_mnt_list()) { /* @@ -1793,6 +1755,11 @@ static void namespace_unlock(void) } else { up_write(&namespace_sem); } + if (unlikely(ns)) { + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); + } shrink_dentry_list(&list); @@ -1865,9 +1832,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) - move_from_ns(p, &tmp_list); - else - list_move(&p->mnt_list, &tmp_list); + move_from_ns(p); + list_add_tail(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ @@ -1896,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { - mnt_add_count(p->mnt_parent, -1); if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); @@ -1973,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags) * all race cases, but it's a slowpath. */ lock_mount_hash(); - if (mnt_get_count(mnt) != 2) { + if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } @@ -2019,23 +1984,27 @@ static int do_umount(struct mount *mnt, int flags) namespace_lock(); lock_mount_hash(); - /* Recheck MNT_LOCKED with the locks held */ + /* Repeat the earlier racy checks, now that we are holding the locks */ retval = -EINVAL; + if (!check_mnt(mnt)) + goto out; + if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; + if (!mnt_has_parent(mnt)) /* not the absolute root */ + goto out; + event++; if (flags & MNT_DETACH) { - if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) - umount_tree(mnt, UMOUNT_PROPAGATE); + umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) - umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } @@ -2053,29 +2022,28 @@ out: * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * - * The caller may hold dentry->d_inode->i_mutex. + * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; namespace_lock(); lock_mount_hash(); - mp = lookup_mountpoint(dentry); - if (!mp) + if (!lookup_mountpoint(dentry, &mp)) goto out_unlock; event++; - while (!hlist_empty(&mp->m_list)) { - mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + while (mp.node.next) { + mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } - put_mountpoint(mp); + unpin_mountpoint(&mp); out_unlock: unlock_mount_hash(); namespace_unlock(); @@ -2259,7 +2227,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, return dst_mnt; src_parent = src_root; - dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) @@ -2294,8 +2261,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); - list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); - attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); + if (src_mnt->mnt.mnt_flags & MNT_LOCKED) + dst_mnt->mnt.mnt_flags |= MNT_LOCKED; + if (unlikely(flag & CL_EXPIRE)) { + /* stick the duplicate mount on the same expiry + * list as the original if that was on one */ + if (!list_empty(&src_mnt->mnt_expire)) + list_add(&dst_mnt->mnt_expire, + &src_mnt->mnt_expire); + } + attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); } } @@ -2310,107 +2285,97 @@ out: return dst_mnt; } -/* Caller should check returned pointer for errors */ - -struct vfsmount *collect_mounts(const struct path *path) +static inline bool extend_array(struct path **res, struct path **to_free, + unsigned n, unsigned *count, unsigned new_count) { - struct mount *tree; - namespace_lock(); - if (!check_mnt(real_mount(path->mnt))) - tree = ERR_PTR(-EINVAL); - else - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); - namespace_unlock(); - if (IS_ERR(tree)) - return ERR_CAST(tree); - return &tree->mnt; -} + struct path *p; -static void free_mnt_ns(struct mnt_namespace *); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + if (likely(n < *count)) + return true; + p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); + if (p && *count) + memcpy(p, *res, *count * sizeof(struct path)); + *count = new_count; + kfree(*to_free); + *to_free = *res = p; + return p; +} -static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +struct path *collect_paths(const struct path *path, + struct path *prealloc, unsigned count) { - /* - * This mount belonged to an anonymous mount namespace - * but was moved to a non-anonymous mount namespace and - * then unmounted. - */ - if (unlikely(!mnt_ns)) - return false; + struct mount *root = real_mount(path->mnt); + struct mount *child; + struct path *res = prealloc, *to_free = NULL; + unsigned n = 0; - /* - * This mount belongs to a non-anonymous mount namespace - * and we know that such a mount can never transition to - * an anonymous mount namespace again. - */ - if (!is_anon_ns(mnt_ns)) { - /* - * A detached mount either belongs to an anonymous mount - * namespace or a non-anonymous mount namespace. It - * should never belong to something purely internal. - */ - VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); - return false; + guard(rwsem_read)(&namespace_sem); + + if (!check_mnt(root)) + return ERR_PTR(-EINVAL); + if (!extend_array(&res, &to_free, 0, &count, 32)) + return ERR_PTR(-ENOMEM); + res[n++] = *path; + list_for_each_entry(child, &root->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, path->dentry)) + continue; + for (struct mount *m = child; m; m = next_mnt(m, child)) { + if (!extend_array(&res, &to_free, n, &count, 2 * count)) + return ERR_PTR(-ENOMEM); + res[n].mnt = &m->mnt; + res[n].dentry = m->mnt.mnt_root; + n++; + } } + if (!extend_array(&res, &to_free, n, &count, count + 1)) + return ERR_PTR(-ENOMEM); + memset(res + n, 0, (count - n) * sizeof(struct path)); + for (struct path *p = res; p->mnt; p++) + path_get(p); + return res; +} - return true; +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ + for (struct path *p = paths; p->mnt; p++) + path_put(p); + if (paths != prealloc) + kfree(paths); } +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + void dissolve_on_fput(struct vfsmount *mnt) { - struct mnt_namespace *ns; struct mount *m = real_mount(mnt); + /* + * m used to be the root of anon namespace; if it still is one, + * we need to dissolve the mount tree and free that namespace. + * Let's try to avoid taking namespace_sem if we can determine + * that there's nothing to do without it - rcu_read_lock() is + * enough to make anon_ns_root() memory-safe and once m has + * left its namespace, it's no longer our concern, since it will + * never become a root of anon ns again. + */ + scoped_guard(rcu) { - if (!must_dissolve(READ_ONCE(m->mnt_ns))) + if (!anon_ns_root(m)) return; } scoped_guard(namespace_lock, &namespace_sem) { - ns = m->mnt_ns; - if (!must_dissolve(ns)) - return; - - /* - * After must_dissolve() we know that this is a detached - * mount in an anonymous mount namespace. - * - * Now when mnt_has_parent() reports that this mount - * tree has a parent, we know that this anonymous mount - * tree has been moved to another anonymous mount - * namespace. - * - * So when closing this file we cannot unmount the mount - * tree. This will be done when the file referring to - * the root of the anonymous mount namespace will be - * closed (It could already be closed but it would sync - * on @namespace_sem and wait for us to finish.). - */ - if (mnt_has_parent(m)) + if (!anon_ns_root(m)) return; + emptied_ns = m->mnt_ns; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } - - /* Make sure we notice when we leak mounts. */ - VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); - free_mnt_ns(ns); } -void drop_collected_mounts(struct vfsmount *mnt) -{ - namespace_lock(); - lock_mount_hash(); - umount_tree(real_mount(mnt), 0); - unlock_mount_hash(); - namespace_unlock(); -} - -bool has_locked_children(struct mount *mnt, struct dentry *dentry) +static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2424,6 +2389,16 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + bool res; + + read_seqlock_excl(&mount_lock); + res = __has_locked_children(mnt, dentry); + read_sequnlock_excl(&mount_lock); + return res; +} + /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces @@ -2468,23 +2443,25 @@ struct vfsmount *clone_private_mount(const struct path *path) if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); - if (mnt_has_parent(old_mnt)) { - if (!check_mnt(old_mnt)) - return ERR_PTR(-EINVAL); - } else { - if (!is_mounted(&old_mnt->mnt)) - return ERR_PTR(-EINVAL); - - /* Make sure this isn't something purely kernel internal. */ - if (!is_anon_ns(old_mnt->mnt_ns)) + /* + * Make sure the source mount is acceptable. + * Anything mounted in our mount namespace is allowed. + * Otherwise, it must be the root of an anonymous mount + * namespace, and we need to make sure no namespace + * loops get created. + */ + if (!check_mnt(old_mnt)) { + if (!anon_ns_root(old_mnt)) return ERR_PTR(-EINVAL); - /* Make sure we don't create mount namespace loops. */ if (!check_for_nsfs_mounts(old_mnt)) return ERR_PTR(-EINVAL); } - if (has_locked_children(old_mnt, path->dentry)) + if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (__has_locked_children(old_mnt, path->dentry)) return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); @@ -2497,21 +2474,6 @@ struct vfsmount *clone_private_mount(const struct path *path) } EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - struct vfsmount *root) -{ - struct mount *mnt; - int res = f(root, arg); - if (res) - return res; - list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { - res = f(&mnt->mnt, arg); - if (res) - return res; - } - return 0; -} - static void lock_mnt_tree(struct mount *mnt) { struct mount *p; @@ -2533,7 +2495,7 @@ static void lock_mnt_tree(struct mount *mnt) if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ - if (list_empty(&p->mnt_expire)) + if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } @@ -2554,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse) struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { - if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + if (!p->mnt_group_id) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); @@ -2590,17 +2552,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) } enum mnt_tree_flags_t { - MNT_TREE_MOVE = BIT(0), - MNT_TREE_BENEATH = BIT(1), - MNT_TREE_PROPAGATION = BIT(2), + MNT_TREE_BENEATH = BIT(0), + MNT_TREE_PROPAGATION = BIT(1), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mnt: mount that @source_mnt will be mounted on * @dest_mp: the mountpoint @source_mnt will be mounted at - * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2663,26 +2623,31 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *top_mnt, - struct mountpoint *dest_mp, - enum mnt_tree_flags_t flags) + struct mount *dest_mnt, + struct mountpoint *dest_mp) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = top_mnt->mnt_ns; - struct mountpoint *smp; - struct mount *child, *dest_mnt, *p; + struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct pinned_mountpoint root = {}; + struct mountpoint *shorter = NULL; + struct mount *child, *p; + struct mount *top; struct hlist_node *n; int err = 0; - bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; + bool moving = mnt_has_parent(source_mnt); /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ - smp = get_mountpoint(source_mnt->mnt.mnt_root); - if (IS_ERR(smp)) - return PTR_ERR(smp); + for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { + if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) + shorter = top->mnt_mp; + } + err = get_mountpoint(top->mnt.mnt_root, &root); + if (err) + return err; /* Is there space to add these mounts to the mount namespace? */ if (!moving) { @@ -2691,11 +2656,6 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } - if (beneath) - dest_mnt = top_mnt->mnt_parent; - else - dest_mnt = top_mnt; - if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -2712,42 +2672,50 @@ static int attach_recursive_mnt(struct mount *source_mnt, } if (moving) { - if (beneath) - dest_mp = smp; - unhash_mnt(source_mnt); - attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + umount_mnt(source_mnt); mnt_notify_add(source_mnt); - touch_mnt_namespace(source_mnt->mnt_ns); + /* if the mount is moved, it should no longer be expired + * automatically */ + list_del_init(&source_mnt->mnt_expire); } else { if (source_mnt->mnt_ns) { - LIST_HEAD(head); - /* move from anon - the caller will destroy */ + emptied_ns = source_mnt->mnt_ns; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) - move_from_ns(p, &head); - list_del_init(&head); + move_from_ns(p); } - if (beneath) - mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); - else - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt); } + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + /* + * Now the original copy is in the same state as the secondaries - + * its root attached to mountpoint, but not hashed and all mounts + * in it are either in our namespace or in no namespace at all. + * Add the original to the list of copies and deal with the + * rest of work for all of them uniformly. + */ + hlist_add_head(&source_mnt->mnt_hash, &tree_list); + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt(&child->mnt_parent->mnt, - child->mnt_mountpoint); - if (q) - mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); - child->mnt.mnt_flags &= ~MNT_LOCKED; + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) { + struct mountpoint *mp = root.mp; + struct mount *r = child; + while (unlikely(r->overmount)) + r = r->overmount; + if (unlikely(shorter) && child != source_mnt) + mp = shorter; + mnt_change_mountpoint(r, mp, q); + } commit_tree(child); } - put_mountpoint(smp); + unpin_mountpoint(&root); unlock_mount_hash(); return 0; @@ -2764,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); - put_mountpoint(smp); + unpin_mountpoint(&root); read_sequnlock_excl(&mount_lock); return err; @@ -2804,12 +2772,12 @@ static int attach_recursive_mnt(struct mount *source_mnt, * Return: Either the target mountpoint on the top mount or the top * mount's mountpoint. */ -static struct mountpoint *do_lock_mount(struct path *path, bool beneath) +static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) { struct vfsmount *mnt = path->mnt; struct dentry *dentry; - struct mountpoint *mp = ERR_PTR(-ENOENT); struct path under = {}; + int err = -ENOENT; for (;;) { struct mount *m = real_mount(mnt); @@ -2847,8 +2815,8 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) path->dentry = dget(mnt->mnt_root); continue; // got overmounted } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) + err = get_mountpoint(dentry, pinned); + if (err) break; if (beneath) { /* @@ -2859,25 +2827,25 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) */ path_put(&under); } - return mp; + return 0; } namespace_unlock(); inode_unlock(dentry->d_inode); if (beneath) path_put(&under); - return mp; + return err; } -static inline struct mountpoint *lock_mount(struct path *path) +static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) { - return do_lock_mount(path, false); + return do_lock_mount(path, m, false); } -static void unlock_mount(struct mountpoint *where) +static void unlock_mount(struct pinned_mountpoint *m) { - inode_unlock(where->m_dentry->d_inode); + inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); - put_mountpoint(where); + unpin_mountpoint(m); read_sequnlock_excl(&mount_lock); namespace_unlock(); } @@ -2891,7 +2859,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, 0); + return attach_recursive_mnt(mnt, p, mp); } /* @@ -2930,16 +2898,18 @@ static int do_change_type(struct path *path, int ms_flags) return -EINVAL; namespace_lock(); + if (!check_mnt(mnt)) { + err = -EINVAL; + goto out_unlock; + } if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) goto out_unlock; } - lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - unlock_mount_hash(); out_unlock: namespace_unlock(); @@ -3013,26 +2983,21 @@ static inline bool may_copy_tree(struct path *path) static struct mount *__do_loopback(struct path *old_path, int recurse) { - struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + struct mount *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) - return mnt; + return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) - return mnt; + return ERR_PTR(-EINVAL); - if (!recurse && has_locked_children(old, old_path->dentry)) - return mnt; + if (!recurse && __has_locked_children(old, old_path->dentry)) + return ERR_PTR(-EINVAL); if (recurse) - mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else - mnt = clone_mnt(old, old_path->dentry, 0); - - if (!IS_ERR(mnt)) - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - - return mnt; + return clone_mnt(old, old_path->dentry, 0); } /* @@ -3043,7 +3008,7 @@ static int do_loopback(struct path *path, const char *old_name, { struct path old_path; struct mount *mnt = NULL, *parent; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; int err; if (!old_name || !*old_name) return -EINVAL; @@ -3055,11 +3020,9 @@ static int do_loopback(struct path *path, const char *old_name, if (mnt_ns_loop(old_path.dentry)) goto out; - mp = lock_mount(path); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = lock_mount(path, &mp); + if (err) goto out; - } parent = real_mount(path->mnt); if (!check_mnt(parent)) @@ -3071,14 +3034,14 @@ static int do_loopback(struct path *path, const char *old_name, goto out2; } - err = graft_tree(mnt, parent, mp); + err = graft_tree(mnt, parent, mp.mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: - unlock_mount(mp); + unlock_mount(&mp); out: path_put(&old_path); return err; @@ -3414,7 +3377,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) goto out; /* From mount should not have locked children in place of To's root */ - if (has_locked_children(from, to->mnt.mnt_root)) + if (__has_locked_children(from, to->mnt.mnt_root)) goto out; /* Setting sharing groups is only allowed on private mounts */ @@ -3426,18 +3389,14 @@ static int do_set_group(struct path *from_path, struct path *to_path) goto out; if (IS_MNT_SLAVE(from)) { - struct mount *m = from->mnt_master; - - list_add(&to->mnt_slave, &m->mnt_slave_list); - to->mnt_master = m; + hlist_add_behind(&to->mnt_slave, &from->mnt_slave); + to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); - lock_mount_hash(); set_mnt_shared(to); - unlock_mount_hash(); } err = 0; @@ -3453,18 +3412,36 @@ out: * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * - * Context: This function expects namespace_lock() to be held. + * Context: namespace_sem must be held at least shared. + * MUST NOT be called under lock_mount_hash() (there one should just + * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { + unsigned seq = read_seqbegin(&mount_lock); + bool no_child; + rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { - rcu_read_unlock(); - return true; - } + no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); - return false; + if (need_seqretry(&mount_lock, seq)) { + read_seqlock_excl(&mount_lock); + no_child = !__lookup_mnt(path->mnt, path->dentry); + read_sequnlock_excl(&mount_lock); + } + return unlikely(!no_child); +} + +/* + * Check if there is a possibly empty chain of descent from p1 to p2. + * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). + */ +static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) +{ + while (p2 != p1 && mnt_has_parent(p2)) + p2 = p2->mnt_parent; + return p2 == p1; } /** @@ -3518,9 +3495,8 @@ static int can_move_mount_beneath(const struct path *from, if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; - for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) - if (p == mnt_to) - return -EINVAL; + if (mount_is_ancestor(mnt_to, mnt_from)) + return -EINVAL; /* * If the parent mount propagates to the child mount this would @@ -3605,75 +3581,67 @@ static int do_move_mount(struct path *old_path, struct mount *p; struct mount *old; struct mount *parent; - struct mountpoint *mp, *old_mp; + struct pinned_mountpoint mp; int err; - bool attached, beneath = flags & MNT_TREE_BENEATH; + bool beneath = flags & MNT_TREE_BENEATH; - mp = do_lock_mount(new_path, beneath); - if (IS_ERR(mp)) - return PTR_ERR(mp); + err = do_lock_mount(new_path, &mp, beneath); + if (err) + return err; old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; - attached = mnt_has_parent(old); - if (attached) - flags |= MNT_TREE_MOVE; - old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; - if (!may_use_mount(p)) - goto out; - /* The thing moved must be mounted... */ - if (!is_mounted(&old->mnt)) - goto out; - - /* ... and either ours or the root of anon namespace */ - if (!(attached ? check_mnt(old) : is_anon_ns(ns))) - goto out; - - if (is_anon_ns(ns) && ns == p->mnt_ns) { + if (check_mnt(old)) { + /* if the source is in our namespace... */ + /* ... it should be detachable from parent */ + if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) + goto out; + /* ... and the target should be in our namespace */ + if (!check_mnt(p)) + goto out; + /* parent of the source should not be shared */ + if (IS_MNT_SHARED(parent)) + goto out; + } else { /* - * Ending up with two files referring to the root of the - * same anonymous mount namespace would cause an error - * as this would mean trying to move the same mount - * twice into the mount tree which would be rejected - * later. But be explicit about it right here. + * otherwise the source must be the root of some anon namespace. */ - goto out; - } else if (is_anon_ns(p->mnt_ns)) { + if (!anon_ns_root(old)) + goto out; /* - * Don't allow moving an attached mount tree to an - * anonymous mount tree. + * Bail out early if the target is within the same namespace - + * subsequent checks would've rejected that, but they lose + * some corner cases if we check it early. */ - goto out; + if (ns == p->mnt_ns) + goto out; + /* + * Target should be either in our namespace or in an acceptable + * anon namespace, sensu check_anonymous_mnt(). + */ + if (!may_use_mount(p)) + goto out; } - if (old->mnt.mnt_flags & MNT_LOCKED) - goto out; - if (!path_mounted(old_path)) goto out; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; - /* - * Don't move a mount residing in a shared parent. - */ - if (attached && IS_MNT_SHARED(parent)) - goto out; if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp); + err = can_move_mount_beneath(old_path, new_path, mp.mp); if (err) goto out; err = -EINVAL; p = p->mnt_parent; - flags |= MNT_TREE_BENEATH; } /* @@ -3685,30 +3653,12 @@ static int do_move_mount(struct path *old_path, err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; - for (; mnt_has_parent(p); p = p->mnt_parent) - if (p == old) - goto out; - - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); - if (err) + if (mount_is_ancestor(old, p)) goto out; - /* if the mount is moved, it should no longer be expire - * automatically */ - list_del_init(&old->mnt_expire); - if (attached) - put_mountpoint(old_mp); + err = attach_recursive_mnt(old, p, mp.mp); out: - unlock_mount(mp); - if (!err) { - if (attached) { - mntput_no_expire(parent); - } else { - /* Make sure we notice when we leak mounts. */ - VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); - free_mnt_ns(ns); - } - } + unlock_mount(&mp); return err; } @@ -3769,7 +3719,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct super_block *sb = fc->root->d_sb; int error; @@ -3790,13 +3740,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - mp = lock_mount(mountpoint); - if (IS_ERR(mp)) { - mntput(mnt); - return PTR_ERR(mp); + error = lock_mount(mountpoint, &mp); + if (!error) { + error = do_add_mount(real_mount(mnt), mp.mp, + mountpoint, mnt_flags); + unlock_mount(&mp); } - error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); - unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -3864,7 +3813,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3896,14 +3845,13 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = 0; goto discard_locked; } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = get_mountpoint(dentry, &mp); + if (err) goto discard_locked; - } - err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(mp); + err = do_add_mount(mnt, mp.mp, path, + path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(&mp); if (unlikely(err)) goto discard; return 0; @@ -3912,12 +3860,6 @@ discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: - /* remove m from any expiration list it may be on */ - if (!list_empty(&mnt->mnt_expire)) { - namespace_lock(); - list_del_init(&mnt->mnt_expire); - namespace_unlock(); - } mntput(m); return err; } @@ -3929,11 +3871,9 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - namespace_lock(); - + read_seqlock_excl(&mount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - - namespace_unlock(); + read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -4287,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); @@ -4712,7 +4652,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct mountpoint *old_mp, *root_mp; + struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4733,9 +4673,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - old_mp = lock_mount(&old); - error = PTR_ERR(old_mp); - if (IS_ERR(old_mp)) + error = lock_mount(&old, &old_mp); + if (error) goto out3; error = -EINVAL; @@ -4762,11 +4701,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; @@ -4775,29 +4714,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; lock_mount_hash(); umount_mnt(new_mnt); - root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } - /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp, false); - mnt_add_count(root_parent, -1); + attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); + umount_mnt(root_mnt); + /* mount old root on put_old */ + attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); - put_mountpoint(root_mp); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: - unlock_mount(old_mp); - if (!error) - mntput_no_expire(ex_parent); + unlock_mount(&old_mp); out3: path_put(&root); out2: @@ -4999,22 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) err = -EINVAL; lock_mount_hash(); - /* Ensure that this isn't anything purely vfs internal. */ - if (!is_mounted(&mnt->mnt)) - goto out; - - /* - * If this is an attached mount make sure it's located in the callers - * mount namespace. If it's not don't let the caller interact with it. - * - * If this mount doesn't have a parent it's most often simply a - * detached mount with an anonymous mount namespace. IOW, something - * that's simply not attached yet. But there are apparently also users - * that do change mount properties on the rootfs itself. That obviously - * neither has a parent nor is it a detached mount so we cannot - * unconditionally check for detached mounts. - */ - if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) + if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* @@ -5261,16 +5181,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); - if (ret < 0) - return ret; - - if (ret) { + if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); - if (ret) - return ret; - finish_mount_kattr(&kattr); } + if (ret) + return ret; } fd = get_unused_fd_flags(flags & O_CLOEXEC); @@ -5382,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s) s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); - s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; } @@ -6174,9 +6090,11 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, false); + ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); + ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; ns->nr_mounts = 1; @@ -6186,7 +6104,6 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; - mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -6233,8 +6150,12 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); - free_mnt_ns(ns); + namespace_lock(); + emptied_ns = ns; + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); } struct vfsmount *kern_mount(struct file_system_type *type) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0d1b6d35ff3b..18b3dc74c70e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -78,7 +78,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in * [!] NOTE: This must be run in the same thread as ->issue_read() was called * in as we access the readahead_control struct. */ -static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) +static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq, + struct readahead_control *ractl) { struct netfs_io_request *rreq = subreq->rreq; size_t rsize = subreq->len; @@ -86,7 +87,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER) rsize = umin(rsize, rreq->io_streams[0].sreq_max_len); - if (rreq->ractl) { + if (ractl) { /* If we don't have sufficient folios in the rolling buffer, * extract a folioq's worth from the readahead region at a time * into the buffer. Note that this acquires a ref on each page @@ -99,7 +100,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) while (rreq->submitted < subreq->start + rsize) { ssize_t added; - added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl, + added = rolling_buffer_load_from_ra(&rreq->buffer, ractl, &put_batch); if (added < 0) return added; @@ -211,7 +212,8 @@ static void netfs_issue_read(struct netfs_io_request *rreq, * slicing up the region to be read according to available cache blocks and * network rsize. */ -static void netfs_read_to_pagecache(struct netfs_io_request *rreq) +static void netfs_read_to_pagecache(struct netfs_io_request *rreq, + struct readahead_control *ractl) { struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned long long start = rreq->start; @@ -262,9 +264,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (ret < 0) { subreq->error = ret; /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, false, + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, false, + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } @@ -291,14 +293,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) break; issue: - slice = netfs_prepare_read_iterator(subreq); + slice = netfs_prepare_read_iterator(subreq, ractl); if (slice < 0) { ret = slice; subreq->error = ret; trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } size -= slice; @@ -312,7 +314,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } /* Defer error return as we may need to wait for outstanding I/O. */ @@ -359,18 +361,15 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); - rreq->ractl = ractl; rreq->submitted = rreq->start; if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) goto cleanup_free; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, ractl); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); - return; + return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); - return; + return netfs_put_request(rreq, netfs_rreq_trace_put_failed); } EXPORT_SYMBOL(netfs_readahead); @@ -389,7 +388,6 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo if (added < 0) return added; rreq->submitted = rreq->start + added; - rreq->ractl = (struct readahead_control *)1UL; return 0; } @@ -459,7 +457,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len); rreq->submitted = rreq->start + flen; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); if (sink) folio_put(sink); @@ -470,11 +468,11 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) folio_mark_uptodate(folio); } folio_unlock(folio); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); alloc_error: folio_unlock(folio); return ret; @@ -528,13 +526,13 @@ int netfs_read_folio(struct file *file, struct folio *folio) if (ret < 0) goto discard; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); alloc_error: folio_unlock(folio); return ret; @@ -685,11 +683,11 @@ retry: if (ret < 0) goto error_put; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); if (ret < 0) goto error; - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_private_2_killable(folio); @@ -701,7 +699,7 @@ have_folio_no_wait: return 0; error_put: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(rreq, netfs_rreq_trace_put_failed); error: if (folio) { folio_unlock(folio); @@ -750,13 +748,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, if (ret < 0) goto error_put; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; error_put: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b4826360a411..f27ea5099a68 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -53,30 +53,40 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping, * data written into the pagecache until we can find out from the server what * the values actually are. */ -static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, - loff_t i_size, loff_t pos, size_t copied) +void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, + loff_t pos, size_t copied) { + loff_t i_size, end = pos + copied; blkcnt_t add; size_t gap; + if (end <= i_size_read(inode)) + return; + if (ctx->ops->update_i_size) { - ctx->ops->update_i_size(inode, pos); + ctx->ops->update_i_size(inode, end); return; } - i_size_write(inode, pos); + spin_lock(&inode->i_lock); + + i_size = i_size_read(inode); + if (end > i_size) { + i_size_write(inode, end); #if IS_ENABLED(CONFIG_FSCACHE) - fscache_update_cookie(ctx->cache, NULL, &pos); + fscache_update_cookie(ctx->cache, NULL, &end); #endif - gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); - if (copied > gap) { - add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); + gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1)); + if (copied > gap) { + add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE); - inode->i_blocks = min_t(blkcnt_t, - DIV_ROUND_UP(pos, SECTOR_SIZE), - inode->i_blocks + add); + inode->i_blocks = min_t(blkcnt_t, + DIV_ROUND_UP(end, SECTOR_SIZE), + inode->i_blocks + add); + } } + spin_unlock(&inode->i_lock); } /** @@ -111,12 +121,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio = NULL, *writethrough = NULL; unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; - loff_t i_size, pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; - if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || - iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) ) { wbc_attach_fdatawrite_inode(&wbc, mapping->host); @@ -345,10 +354,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, flush_dcache_folio(folio); /* Update the inode size if we moved the EOF marker */ + netfs_update_i_size(ctx, inode, pos, copied); pos += copied; - i_size = i_size_read(inode); - if (pos > i_size) - netfs_update_i_size(ctx, inode, i_size, pos, copied); written += copied; if (likely(!wreq)) { @@ -386,7 +393,7 @@ out: wbc_detach_inode(&wbc); if (ret2 == -EIOCBQUEUED) return ret2; - if (ret == 0) + if (ret == 0 && ret2 < 0) ret = ret2; } diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 5e3f0aeb51f3..a05e13472baf 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -85,7 +85,7 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } } @@ -103,19 +103,16 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) rreq->netfs_ops->issue_read(subreq); if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) - netfs_wait_for_pause(rreq); + netfs_wait_for_paused_read(rreq); if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) break; - if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && - test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) - break; cond_resched(); } while (size > 0); if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } return ret; @@ -144,7 +141,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) ret = netfs_dispatch_unbuffered_reads(rreq); if (!rreq->submitted) { - netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); inode_dio_end(rreq->inode); ret = 0; goto out; @@ -188,7 +185,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, orig_count, - NETFS_DIO_READ); + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_READ : NETFS_UNBUFFERED_READ); if (IS_ERR(rreq)) return PTR_ERR(rreq); @@ -236,7 +234,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i } out: - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; return ret; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 42ce53cc216e..a16660ab7f83 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -9,20 +9,6 @@ #include <linux/uio.h> #include "internal.h" -static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) -{ - struct inode *inode = wreq->inode; - unsigned long long end = wreq->start + wreq->transferred; - - if (!wreq->error && - i_size_read(inode) < end) { - if (wreq->netfs_ops->update_i_size) - wreq->netfs_ops->update_i_size(inode, end); - else - i_size_write(inode, end); - } -} - /* * Perform an unbuffered write where we may have to do an RMW operation on an * encrypted file. This can also be used for direct I/O writes. @@ -87,6 +73,8 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); + if (async) + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); /* Copy the data into the bounce buffer and encrypt it. */ // TODO @@ -96,7 +84,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * if (async) wreq->iocb = iocb; wreq->len = iov_iter_count(&wreq->buffer.iter); - wreq->cleanup = netfs_cleanup_dio_write; ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); if (ret < 0) { _debug("begin = %zd", ret); @@ -104,20 +91,15 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * } if (!async) { - trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - ret = wreq->error; - if (ret == 0) { - ret = wreq->transferred; + ret = netfs_wait_for_write(wreq); + if (ret > 0) iocb->ki_pos += ret; - } } else { ret = -EIOCBQUEUED; } out: - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index b1722a82c03d..e4308457633c 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -192,8 +192,7 @@ EXPORT_SYMBOL(__fscache_clear_page_bits); /* * Deal with the completion of writing the data to the cache. */ -static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, - bool was_async) +static void fscache_wreq_done(void *priv, ssize_t transferred_or_error) { struct fscache_write_request *wreq = priv; @@ -202,8 +201,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, wreq->set_bits); if (wreq->term_func) - wreq->term_func(wreq->term_func_priv, transferred_or_error, - was_async); + wreq->term_func(wreq->term_func_priv, transferred_or_error); fscache_end_operation(&wreq->cache_resources); kfree(wreq); } @@ -255,14 +253,14 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, return; abandon_end: - return fscache_wreq_done(wreq, ret, false); + return fscache_wreq_done(wreq, ret); abandon_free: kfree(wreq); abandon: if (using_pgpriv2) fscache_clear_page_bits(mapping, start, len, cond); if (term_func) - term_func(term_func_priv, ret, false); + term_func(term_func_priv, ret); } EXPORT_SYMBOL(__fscache_write_to_cache); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 1c4f953c3d68..d4f16fefd965 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,11 +23,17 @@ /* * buffered_read.c */ -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); /* + * buffered_write.c + */ +void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, + loff_t pos, size_t copied); + +/* * main.c */ extern unsigned int netfs_debug; @@ -62,6 +68,14 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, enum netfs_folioq_trace trace); void netfs_reset_iter(struct netfs_io_subrequest *subreq); +void netfs_wake_collector(struct netfs_io_request *rreq); +void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq); +void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, + struct netfs_io_stream *stream); +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); +ssize_t netfs_wait_for_write(struct netfs_io_request *rreq); +void netfs_wait_for_paused_read(struct netfs_io_request *rreq); +void netfs_wait_for_paused_write(struct netfs_io_request *rreq); /* * objects.c @@ -71,9 +85,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, loff_t start, size_t len, enum netfs_io_origin origin); void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); -void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async); -void netfs_put_request(struct netfs_io_request *rreq, bool was_async, - enum netfs_rreq_ref_trace what); +void netfs_clear_subrequests(struct netfs_io_request *rreq); +void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, @@ -92,11 +105,9 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq, /* * read_collect.c */ +bool netfs_read_collection(struct netfs_io_request *rreq); void netfs_read_collection_worker(struct work_struct *work); -void netfs_wake_read_collector(struct netfs_io_request *rreq); -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); -ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); -void netfs_wait_for_pause(struct netfs_io_request *rreq); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); /* * read_pgpriv2.c @@ -176,8 +187,8 @@ static inline void netfs_stat_d(atomic_t *stat) * write_collect.c */ int netfs_folio_written_back(struct folio *folio); +bool netfs_write_collection(struct netfs_io_request *wreq); void netfs_write_collection_worker(struct work_struct *work); -void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async); /* * write_issue.c @@ -198,8 +209,8 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache); -int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache); +ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache); int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* @@ -255,6 +266,39 @@ static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) } /* + * Clear and wake up a NETFS_RREQ_* flag bit on a request. + */ +static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq, + unsigned int rreq_flag, + enum netfs_rreq_trace trace) +{ + if (test_bit(rreq_flag, &rreq->flags)) { + clear_bit_unlock(rreq_flag, &rreq->flags); + smp_mb__after_atomic(); /* Set flag before task state */ + trace_netfs_rreq(rreq, trace); + wake_up(&rreq->waitq); + } +} + +/* + * Test the NETFS_RREQ_IN_PROGRESS flag, inserting an appropriate barrier. + */ +static inline bool netfs_check_rreq_in_progress(const struct netfs_io_request *rreq) +{ + /* Order read of flags before read of anything else, such as error. */ + return test_bit_acquire(NETFS_RREQ_IN_PROGRESS, &rreq->flags); +} + +/* + * Test the NETFS_SREQ_IN_PROGRESS flag, inserting an appropriate barrier. + */ +static inline bool netfs_check_subreq_in_progress(const struct netfs_io_subrequest *subreq) +{ + /* Order read of flags before read of anything else, such as error. */ + return test_bit_acquire(NETFS_SREQ_IN_PROGRESS, &subreq->flags); +} + +/* * fscache-cache.c */ #ifdef CONFIG_PROC_FS diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 70ecc8f5f210..73da6c9f5777 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -39,6 +39,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READ_GAPS] = "RG", [NETFS_READ_SINGLE] = "R1", [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_UNBUFFERED_READ] = "UR", [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", [NETFS_WRITEBACK_SINGLE] = "W1", @@ -57,15 +58,15 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) if (v == &netfs_io_requests) { seq_puts(m, - "REQUEST OR REF FL ERR OPS COVERAGE\n" - "======== == === == ==== === =========\n" + "REQUEST OR REF FLAG ERR OPS COVERAGE\n" + "======== == === ==== ==== === =========\n" ); return 0; } rreq = list_entry(v, struct netfs_io_request, proc_link); seq_printf(m, - "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx", + "%08x %s %3d %4lx %4ld %3d @%04llx %llx/%llx", rreq->debug_id, netfs_origins[rreq->origin], refcount_read(&rreq->ref), diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 7099aa07737a..20748bcfbf59 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -313,3 +313,234 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) return true; } EXPORT_SYMBOL(netfs_release_folio); + +/* + * Wake the collection work item. + */ +void netfs_wake_collector(struct netfs_io_request *rreq) +{ + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { + queue_work(system_unbound_wq, &rreq->work); + } else { + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); + wake_up(&rreq->waitq); + } +} + +/* + * Mark a subrequest as no longer being in progress and, if need be, wake the + * collector. + */ +void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct netfs_io_stream *stream = &rreq->io_streams[subreq->stream_nr]; + + clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ + + /* If we are at the head of the queue, wake up the collector. */ + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) + netfs_wake_collector(rreq); +} + +/* + * Wait for all outstanding I/O in a stream to quiesce. + */ +void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, + struct netfs_io_stream *stream) +{ + struct netfs_io_subrequest *subreq; + DEFINE_WAIT(myself); + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (!netfs_check_subreq_in_progress(subreq)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_quiesce); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!netfs_check_subreq_in_progress(subreq)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + } + } + + trace_netfs_rreq(rreq, netfs_rreq_trace_waited_quiesce); + finish_wait(&rreq->waitq, &myself); +} + +/* + * Perform collection in app thread if not offloaded to workqueue. + */ +static int netfs_collect_in_app(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + bool need_collect = false, inactive = true, done = true; + + if (!netfs_check_rreq_in_progress(rreq)) { + trace_netfs_rreq(rreq, netfs_rreq_trace_recollect); + return 1; /* Done */ + } + + for (int i = 0; i < NR_IO_STREAMS; i++) { + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[i]; + + if (!stream->active) + continue; + inactive = false; + trace_netfs_collect_stream(rreq, stream); + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, + rreq_link); + if (subreq && + (!netfs_check_subreq_in_progress(subreq) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + need_collect = true; + break; + } + if (subreq || !test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) + done = false; + } + + if (!need_collect && !inactive && !done) + return 0; /* Sleep */ + + __set_current_state(TASK_RUNNING); + if (collector(rreq)) { + /* Drop the ref from the NETFS_RREQ_IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + return 1; /* Done */ + } + + if (inactive) { + WARN(true, "Failed to collect inactive req R=%08x\n", + rreq->debug_id); + cond_resched(); + } + return 2; /* Again */ +} + +/* + * Wait for a request to complete, successfully or otherwise. + */ +static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + DEFINE_WAIT(myself); + ssize_t ret; + + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + switch (netfs_collect_in_app(rreq, collector)) { + case 0: + break; + case 1: + goto all_collected; + case 2: + if (!netfs_check_rreq_in_progress(rreq)) + break; + cond_resched(); + continue; + } + } + + if (!netfs_check_rreq_in_progress(rreq)) + break; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + schedule(); + } + +all_collected: + trace_netfs_rreq(rreq, netfs_rreq_trace_waited_ip); + finish_wait(&rreq->waitq, &myself); + + ret = rreq->error; + if (ret == 0) { + ret = rreq->transferred; + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_DIO_WRITE: + case NETFS_READ_SINGLE: + case NETFS_UNBUFFERED_READ: + case NETFS_UNBUFFERED_WRITE: + break; + default: + if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + break; + } + } + + return ret; +} + +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) +{ + return netfs_wait_for_in_progress(rreq, netfs_read_collection); +} + +ssize_t netfs_wait_for_write(struct netfs_io_request *rreq) +{ + return netfs_wait_for_in_progress(rreq, netfs_write_collection); +} + +/* + * Wait for a paused operation to unpause or complete in some manner. + */ +static void netfs_wait_for_pause(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + DEFINE_WAIT(myself); + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + switch (netfs_collect_in_app(rreq, collector)) { + case 0: + break; + case 1: + goto all_collected; + case 2: + if (!netfs_check_rreq_in_progress(rreq) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + cond_resched(); + continue; + } + } + + if (!netfs_check_rreq_in_progress(rreq) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + + schedule(); + } + +all_collected: + trace_netfs_rreq(rreq, netfs_rreq_trace_waited_pause); + finish_wait(&rreq->waitq, &myself); +} + +void netfs_wait_for_paused_read(struct netfs_io_request *rreq) +{ + return netfs_wait_for_pause(rreq, netfs_read_collection); +} + +void netfs_wait_for_paused_write(struct netfs_io_request *rreq) +{ + return netfs_wait_for_pause(rreq, netfs_write_collection); +} diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index dc6b41ef18b0..e8c99738b5bb 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -10,6 +10,8 @@ #include <linux/delay.h> #include "internal.h" +static void netfs_free_request(struct work_struct *work); + /* * Allocate an I/O request and initialise it. */ @@ -34,6 +36,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } memset(rreq, 0, kmem_cache_size(cache)); + INIT_WORK(&rreq->cleanup_work, netfs_free_request); rreq->start = start; rreq->len = len; rreq->origin = origin; @@ -49,13 +52,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); init_waitqueue_head(&rreq->waitq); - refcount_set(&rreq->ref, 1); + refcount_set(&rreq->ref, 2); if (origin == NETFS_READAHEAD || origin == NETFS_READPAGE || origin == NETFS_READ_GAPS || origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || + origin == NETFS_UNBUFFERED_READ || origin == NETFS_DIO_READ) { INIT_WORK(&rreq->work, netfs_read_collection_worker); rreq->io_streams[0].avail = true; @@ -64,8 +68,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (file && file->f_flags & O_NONBLOCK) - __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -75,7 +77,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } atomic_inc(&ctx->io_count); - trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); + trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; @@ -89,7 +91,7 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace trace_netfs_rreq_ref(rreq->debug_id, r + 1, what); } -void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) +void netfs_clear_subrequests(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; @@ -101,8 +103,7 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) subreq = list_first_entry(&stream->subrequests, struct netfs_io_subrequest, rreq_link); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async, - netfs_sreq_trace_put_clear); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_clear); } } } @@ -118,13 +119,19 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); + container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); + + /* Cancel/flush the result collection worker. That does not carry a + * ref of its own, so we must wait for it somewhere. + */ + cancel_work_sync(&rreq->work); + netfs_proc_del_rreq(rreq); - netfs_clear_subrequests(rreq, false); + netfs_clear_subrequests(rreq); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) @@ -145,8 +152,7 @@ static void netfs_free_request(struct work_struct *work) call_rcu(&rreq->rcu, netfs_free_request_rcu); } -void netfs_put_request(struct netfs_io_request *rreq, bool was_async, - enum netfs_rreq_ref_trace what) +void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) { unsigned int debug_id; bool dead; @@ -156,15 +162,8 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, debug_id = rreq->debug_id; dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - WARN_ON(1); - } else { - netfs_free_request(&rreq->work); - } - } + if (dead) + WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); } } @@ -206,8 +205,7 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq, what); } -static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, - bool was_async) +static void netfs_free_subrequest(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -216,10 +214,10 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, rreq->netfs_ops->free_subrequest(subreq); mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool); netfs_stat_d(&netfs_n_rh_sreq); - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); + netfs_put_request(rreq, netfs_rreq_trace_put_subreq); } -void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, +void netfs_put_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what) { unsigned int debug_index = subreq->debug_index; @@ -230,5 +228,5 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, dead = __refcount_dec_and_test(&subreq->ref, &r); trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what); if (dead) - netfs_free_subrequest(subreq, was_async); + netfs_free_subrequest(subreq); } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 23c75755ad4e..3e804da1e1eb 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -83,14 +83,12 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq, } just_unlock: - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio->index == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { - _debug("no unlock"); - } else { - trace_netfs_folio(folio, netfs_folio_trace_read_unlock); - folio_unlock(folio); - } + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { + _debug("no unlock"); + } else { + trace_netfs_folio(folio, netfs_folio_trace_read_unlock); + folio_unlock(folio); } folioq_clear(folioq, slot); @@ -220,7 +218,7 @@ reassess: stream->collected_to = front->start; } - if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) + if (netfs_check_subreq_in_progress(front)) notes |= HIT_PENDING; smp_rmb(); /* Read counters after IN_PROGRESS flag. */ transferred = READ_ONCE(front->transferred); @@ -280,9 +278,13 @@ reassess: stream->need_retry = true; notes |= NEED_RETRY | MADE_PROGRESS; break; + } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) { + notes |= MADE_PROGRESS; } else { if (!stream->failed) - stream->transferred = stream->collected_to - rreq->start; + stream->transferred += transferred; + if (front->transferred < front->len) + set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags); notes |= MADE_PROGRESS; } @@ -291,13 +293,15 @@ reassess: spin_lock(&rreq->lock); remove = front; - trace_netfs_sreq(front, netfs_sreq_trace_discard); + trace_netfs_sreq(front, + notes & ABANDON_SREQ ? + netfs_sreq_trace_abandoned : netfs_sreq_trace_consumed); list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); stream->front = front; spin_unlock(&rreq->lock); - netfs_put_subrequest(remove, false, + netfs_put_subrequest(remove, notes & ABANDON_SREQ ? netfs_sreq_trace_put_abandon : netfs_sreq_trace_put_done); @@ -311,14 +315,8 @@ reassess: if (notes & NEED_RETRY) goto need_retry; - if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) { - trace_netfs_rreq(rreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags); - smp_mb__after_atomic(); /* Set PAUSE before task state */ - wake_up(&rreq->waitq); - } - if (notes & MADE_PROGRESS) { + netfs_wake_rreq_flag(rreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause); //cond_resched(); goto reassess; } @@ -342,24 +340,10 @@ need_retry: */ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned int i; - /* Collect unbuffered reads and direct reads, adding up the transfer - * sizes until we find the first short or failed subrequest. - */ - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - rreq->transferred += subreq->transferred; - - if (subreq->transferred < subreq->len || - test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { - rreq->error = subreq->error; - break; - } - } - - if (rreq->origin == NETFS_DIO_READ) { + if (rreq->origin == NETFS_UNBUFFERED_READ || + rreq->origin == NETFS_DIO_READ) { for (i = 0; i < rreq->direct_bv_count; i++) { flush_dcache_page(rreq->direct_bv[i].bv_page); // TODO: cifs marks pages in the destination buffer @@ -371,13 +355,16 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) if (rreq->iocb) { rreq->iocb->ki_pos += rreq->transferred; - if (rreq->iocb->ki_complete) + if (rreq->iocb->ki_complete) { + trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete); rreq->iocb->ki_complete( rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } } if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); - if (rreq->origin == NETFS_DIO_READ) + if (rreq->origin == NETFS_UNBUFFERED_READ || + rreq->origin == NETFS_DIO_READ) inode_dio_end(rreq->inode); } @@ -396,9 +383,11 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) if (rreq->iocb) { rreq->iocb->ki_pos += rreq->transferred; - if (rreq->iocb->ki_complete) + if (rreq->iocb->ki_complete) { + trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete); rreq->iocb->ki_complete( rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } } if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -410,7 +399,7 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) * Note that we're in normal kernel thread context at this point, possibly * running on a workqueue. */ -static void netfs_read_collection(struct netfs_io_request *rreq) +bool netfs_read_collection(struct netfs_io_request *rreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -420,11 +409,11 @@ static void netfs_read_collection(struct netfs_io_request *rreq) * queue is empty. */ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) - return; + return false; smp_rmb(); /* Read ALL_QUEUED before subreq lists. */ if (!list_empty(&stream->subrequests)) - return; + return false; /* Okay, declare that all I/O is complete. */ rreq->transferred = stream->transferred; @@ -433,6 +422,7 @@ static void netfs_read_collection(struct netfs_io_request *rreq) //netfs_rreq_is_still_valid(rreq); switch (rreq->origin) { + case NETFS_UNBUFFERED_READ: case NETFS_DIO_READ: case NETFS_READ_GAPS: netfs_rreq_assess_dio(rreq); @@ -445,14 +435,15 @@ static void netfs_read_collection(struct netfs_io_request *rreq) } task_io_account_read(rreq->transferred); - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + netfs_wake_rreq_flag(rreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_clear_subrequests(rreq, false); + netfs_clear_subrequests(rreq); netfs_unlock_abandoned_read_pages(rreq); if (unlikely(rreq->copy_to_cache)) netfs_pgpriv2_end_copy_to_cache(rreq); + return true; } void netfs_read_collection_worker(struct work_struct *work) @@ -460,26 +451,12 @@ void netfs_read_collection_worker(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); netfs_see_request(rreq, netfs_rreq_trace_see_work); - if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - netfs_read_collection(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_work); -} - -/* - * Wake the collection work item. - */ -void netfs_wake_read_collector(struct netfs_io_request *rreq) -{ - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && - !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - if (!work_pending(&rreq->work)) { - netfs_get_request(rreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &rreq->work)) - netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq); - } - } else { - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); - wake_up(&rreq->waitq); + if (netfs_check_rreq_in_progress(rreq)) { + if (netfs_read_collection(rreq)) + /* Drop the ref from the IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + else + netfs_see_request(rreq, netfs_rreq_trace_see_work_complete); } } @@ -511,7 +488,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq) list_is_first(&subreq->rreq_link, &stream->subrequests) ) { __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -535,7 +512,6 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -582,23 +558,15 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) } trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ - - /* If we are at the head of the queue, wake up the collector. */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests) || - test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) - netfs_wake_read_collector(rreq); - - netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); + netfs_subreq_clear_in_progress(subreq); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); /* * Handle termination of a read from the cache. */ -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error) { struct netfs_io_subrequest *subreq = priv; @@ -613,94 +581,3 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool } netfs_read_subreq_terminated(subreq); } - -/* - * Wait for the read operation to complete, successfully or otherwise. - */ -ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); - ssize_t ret; - - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; - } - - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); - - ret = rreq->error; - if (ret == 0) { - ret = rreq->transferred; - switch (rreq->origin) { - case NETFS_DIO_READ: - case NETFS_READ_SINGLE: - ret = rreq->transferred; - break; - default: - if (rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - break; - } - } - - return ret; -} - -/* - * Wait for a paused read operation to unpause or complete in some manner. - */ -void netfs_wait_for_pause(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); - - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; - } - } - - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || - !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) - break; - - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); -} diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index cf7727060215..8097bc069c1d 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -110,13 +110,15 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( if (!creq->io_streams[1].avail) goto cancel_put; + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &creq->flags); + trace_netfs_copy2cache(rreq, creq); trace_netfs_write(creq, netfs_write_trace_copy_to_cache); netfs_stat(&netfs_n_wh_copy_to_cache); rreq->copy_to_cache = creq; return creq; cancel_put: - netfs_put_request(creq, false, netfs_rreq_trace_put_return); + netfs_put_request(creq, netfs_rreq_trace_put_return); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); @@ -154,8 +156,11 @@ void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq) netfs_issue_write(creq, &creq->io_streams[1]); smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags); + trace_netfs_rreq(rreq, netfs_rreq_trace_end_copy_to_cache); + if (list_empty_careful(&creq->io_streams[1].subrequests)) + netfs_wake_collector(creq); - netfs_put_request(creq, false, netfs_rreq_trace_put_return); + netfs_put_request(creq, netfs_rreq_trace_put_return); creq->copy_to_cache = NULL; } diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0f294b26e08c..b99e84a8170a 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -173,7 +173,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; } @@ -257,35 +257,15 @@ abandon: */ void netfs_retry_reads(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); netfs_stat(&netfs_n_rh_retry_read_req); - set_bit(NETFS_RREQ_RETRYING, &rreq->flags); - /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) - continue; - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - for (;;) { - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) - break; - - trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); - } + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); + netfs_wait_for_in_progress_stream(rreq, stream); clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fea0ecdecc53..fa622a6cd56d 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -142,7 +142,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); return ret; cancel: - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); return ret; } @@ -185,11 +185,11 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite netfs_single_dispatch_read(rreq); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret; cleanup_free: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(rreq, netfs_rreq_trace_put_failed); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 3fca59e6475d..0f3a36852a4d 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -240,7 +240,7 @@ reassess_streams: } /* Stall if the front is still undergoing I/O. */ - if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { + if (netfs_check_subreq_in_progress(front)) { notes |= HIT_PENDING; break; } @@ -280,7 +280,7 @@ reassess_streams: struct netfs_io_subrequest, rreq_link); stream->front = front; spin_unlock(&wreq->lock); - netfs_put_subrequest(remove, false, + netfs_put_subrequest(remove, notes & SAW_FAILURE ? netfs_sreq_trace_put_cancel : netfs_sreq_trace_put_done); @@ -321,18 +321,14 @@ reassess_streams: if (notes & NEED_RETRY) goto need_retry; - if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { - trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - smp_mb__after_atomic(); /* Set PAUSE before task state */ - wake_up(&wreq->waitq); - } - if (notes & NEED_REASSESS) { + if (notes & MADE_PROGRESS) { + netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause); //cond_resched(); goto reassess_streams; } - if (notes & MADE_PROGRESS) { + + if (notes & NEED_REASSESS) { //cond_resched(); goto reassess_streams; } @@ -356,30 +352,21 @@ need_retry: /* * Perform the collection of subrequests, folios and encryption buffers. */ -void netfs_write_collection_worker(struct work_struct *work) +bool netfs_write_collection(struct netfs_io_request *wreq) { - struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work); struct netfs_inode *ictx = netfs_inode(wreq->inode); size_t transferred; int s; _enter("R=%x", wreq->debug_id); - netfs_see_request(wreq, netfs_rreq_trace_see_work); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } - netfs_collect_write_results(wreq); /* We're done when the app thread has finished posting subreqs and all * the queues in all the streams are empty. */ - if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } + if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) + return false; smp_rmb(); /* Read ALL_QUEUED before lists. */ transferred = LONG_MAX; @@ -387,10 +374,8 @@ void netfs_write_collection_worker(struct work_struct *work) struct netfs_io_stream *stream = &wreq->io_streams[s]; if (!stream->active) continue; - if (!list_empty(&stream->subrequests)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } + if (!list_empty(&stream->subrequests)) + return false; if (stream->transferred < transferred) transferred = stream->transferred; } @@ -408,8 +393,10 @@ void netfs_write_collection_worker(struct work_struct *work) ictx->ops->invalidate_cache(wreq); } - if (wreq->cleanup) - wreq->cleanup(wreq); + if ((wreq->origin == NETFS_UNBUFFERED_WRITE || + wreq->origin == NETFS_DIO_WRITE) && + !wreq->error) + netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred); if (wreq->origin == NETFS_DIO_WRITE && wreq->mapping->nrpages) { @@ -428,31 +415,35 @@ void netfs_write_collection_worker(struct work_struct *work) inode_dio_end(wreq->inode); _debug("finished"); - trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); wreq->iocb->ki_pos += written; - if (wreq->iocb->ki_complete) + if (wreq->iocb->ki_complete) { + trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete); wreq->iocb->ki_complete( wreq->iocb, wreq->error ? wreq->error : written); + } wreq->iocb = VFS_PTR_POISON; } - netfs_clear_subrequests(wreq, false); - netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete); + netfs_clear_subrequests(wreq); + return true; } -/* - * Wake the collection work item. - */ -void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) +void netfs_write_collection_worker(struct work_struct *work) { - if (!work_pending(&wreq->work)) { - netfs_get_request(wreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &wreq->work)) - netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq); + struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + + netfs_see_request(rreq, netfs_rreq_trace_see_work); + if (netfs_check_rreq_in_progress(rreq)) { + if (netfs_write_collection(rreq)) + /* Drop the ref from the IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + else + netfs_see_request(rreq, netfs_rreq_trace_see_work_complete); } } @@ -460,7 +451,6 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * netfs_write_subrequest_terminated - Note the termination of a write operation. * @_op: The I/O request that has terminated. * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous * * This tells the library that a contributory write I/O operation has * terminated, one way or another, and that it should collect the results. @@ -470,21 +460,16 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * negative error code. The library will look after reissuing I/O operations * as appropriate and writing downloaded data to the cache. * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. - * * When this is called, ownership of the subrequest is transferred back to the * library, along with a ref. * * Note that %_op is a void* so that the function can be passed to * kiocb::term_func without the need for a casting wrapper. */ -void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async) +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error) { struct netfs_io_subrequest *subreq = _op; struct netfs_io_request *wreq = subreq->rreq; - struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); @@ -495,8 +480,6 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write_done); break; - case NETFS_INVALID_WRITE: - break; default: BUG(); } @@ -536,15 +519,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, } trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - - /* If we are at the head of the queue, wake up the collector, - * transferring a ref to it if we were the ones to do so. - */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) - netfs_wake_write_collector(wreq, was_async); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + netfs_subreq_clear_in_progress(subreq); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_write_subrequest_terminated); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 77279fc5b5a7..50bee2c4130d 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -134,7 +134,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: wreq->error = -ENOMEM; - netfs_put_request(wreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(wreq, netfs_rreq_trace_put_failed); return ERR_PTR(-ENOMEM); } @@ -233,7 +233,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) - return netfs_write_subrequest_terminated(subreq, subreq->error, false); + return netfs_write_subrequest_terminated(subreq, subreq->error); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); stream->issue_write(subreq); @@ -542,7 +542,7 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq) } if (needs_poke) - netfs_wake_write_collector(wreq, false); + netfs_wake_collector(wreq); } /* @@ -576,6 +576,7 @@ int netfs_writepages(struct address_space *mapping, goto couldnt_start; } + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); trace_netfs_write(wreq, netfs_write_trace_writeback); netfs_stat(&netfs_n_wh_writepages); @@ -599,8 +600,9 @@ int netfs_writepages(struct address_space *mapping, netfs_end_issue_write(wreq); mutex_unlock(&ictx->wb_lock); + netfs_wake_collector(wreq); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", error); return error; @@ -673,11 +675,11 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c /* * End a write operation used when writing through the pagecache. */ -int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache) +ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache) { struct netfs_inode *ictx = netfs_inode(wreq->inode); - int ret; + ssize_t ret; _enter("R=%x", wreq->debug_id); @@ -688,13 +690,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr mutex_unlock(&ictx->wb_lock); - if (wreq->iocb) { + if (wreq->iocb) ret = -EIOCBQUEUED; - } else { - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - ret = wreq->error; - } - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + else + ret = netfs_wait_for_write(wreq); + netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } @@ -722,10 +722,8 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t start += part; len -= part; rolling_buffer_advance(&wreq->buffer, part); - if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { - trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); - wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags)); - } + if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) + netfs_wait_for_paused_write(wreq); if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) break; } @@ -885,7 +883,8 @@ int netfs_writeback_single(struct address_space *mapping, goto couldnt_start; } - trace_netfs_write(wreq, netfs_write_trace_writeback); + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); + trace_netfs_write(wreq, netfs_write_trace_writeback_single); netfs_stat(&netfs_n_wh_writepages); if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) @@ -914,8 +913,9 @@ stop: set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); mutex_unlock(&ictx->wb_lock); + netfs_wake_collector(wreq); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", ret); return ret; diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index 545d33079a77..fc9c3e0d34d8 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -39,9 +39,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { - struct iov_iter source = subreq->io_iter; + struct iov_iter source; - iov_iter_revert(&source, subreq->len - source.count); + netfs_reset_iter(subreq); + source = subreq->io_iter; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -131,7 +132,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_discard); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; } @@ -145,14 +146,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq = netfs_alloc_subrequest(wreq); subreq->source = to->source; subreq->start = start; - subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + trace_netfs_sreq(subreq, netfs_sreq_trace_split); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); @@ -199,7 +199,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, */ void netfs_retry_writes(struct netfs_io_request *wreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; int s; @@ -208,16 +207,13 @@ void netfs_retry_writes(struct netfs_io_request *wreq) /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ + set_bit(NETFS_RREQ_RETRYING, &wreq->flags); for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; - if (!stream->active) - continue; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - } + if (stream->active) + netfs_wait_for_in_progress_stream(wreq, stream); } + clear_bit(NETFS_RREQ_RETRYING, &wreq->flags); // TODO: Enc: Fetch changed partial pages // TODO: Enc: Reencrypt content if needed. diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index d8d50a88de04..d526f5ba7887 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -141,24 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = { .destroy_msg = bl_pipe_destroy_msg, }; -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, +static int nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + return -ENOENT; + err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); + return err; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, @@ -167,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -180,16 +173,10 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; + ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); break; case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + rpc_unlink(nn->bl_device_pipe); break; default: ret = -ENOTSUPP; @@ -203,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) +static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; - struct dentry *dentry; + int ret; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + return 0; + ret = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); - return dentry; + return ret; } static void nfs4blocklayout_unregister_net(struct net *net, @@ -224,7 +210,7 @@ static void nfs4blocklayout_unregister_net(struct net *net, pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_unlink(pipe); rpc_put_sb_net(net); } } @@ -232,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net, static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; + int err; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { + err = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (unlikely(err)) rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; + return err; } static void nfs4blocklayout_net_exit(struct net *net) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6d63b958c4bb..cf35ad3f818a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; - clp->cl_net = get_net(cl_init->net); + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); @@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp) if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); @@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) - nfs_local_probe(new); + nfs_local_probe_async(new); return new; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 8bdbc4dca89c..10ef46e29b25 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1021,13 +1021,6 @@ out: nfs_inode_find_state_and_recover(inode, stateid); } -void nfs_remove_bad_delegation(struct inode *inode, - const nfs4_stateid *stateid) -{ - nfs_revoke_delegation(inode, stateid); -} -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); - void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid) { @@ -1070,6 +1063,24 @@ out_rcu_unlock: } /** + * nfs_remove_bad_delegation - handle delegations that are unusable + * @inode: inode to process + * @stateid: the delegation's stateid + * + * If the server ACK-ed our FREE_STATEID then clean + * up the delegation, else mark and keep the revoked state. + */ +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE) + nfs_delegation_mark_returned(inode, stateid); + else + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +/** * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 033feeab8c34..86e36c630f09 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -207,24 +207,25 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe EXPORT_SYMBOL_GPL(nfs_file_splice_read); int -nfs_file_mmap(struct file *file, struct vm_area_struct *vma) +nfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); int status; dprintk("NFS: mmap(%pD2)\n", file); - /* Note: generic_file_mmap() returns ENOSYS on nommu systems + /* Note: generic_file_mmap_prepare() returns ENOSYS on nommu systems * so we call that before revalidating the mapping */ - status = generic_file_mmap(file, vma); + status = generic_file_mmap_prepare(desc); if (!status) { - vma->vm_ops = &nfs_file_vm_ops; + desc->vm_ops = &nfs_file_vm_ops; status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } -EXPORT_SYMBOL_GPL(nfs_file_mmap); +EXPORT_SYMBOL_GPL(nfs_file_mmap_prepare); /* * Flush any dirty pages for this process, and check for write errors. @@ -342,12 +343,14 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_write_begin(struct file *file, struct address_space *mapping, +static int nfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; + struct file *file = iocb->ki_filp; int once_thru = 0; int ret; @@ -377,10 +380,12 @@ start: return ret; } -static int nfs_write_end(struct file *file, struct address_space *mapping, +static int nfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = nfs_file_open_context(file); unsigned offset = offset_in_folio(folio, pos); int status; @@ -899,7 +904,7 @@ const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index e6909cafab68..4bea008dbebd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1105,6 +1105,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) } static int ff_layout_async_handle_error_v4(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, @@ -1115,32 +1116,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - switch (task->tk_status) { - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: + switch (op_status) { + case NFS4_OK: + case NFS4ERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFS4ERR_BADSESSION: + case NFS4ERR_BADSLOT: + case NFS4ERR_BAD_HIGH_SLOT: + case NFS4ERR_DEADSESSION: + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case NFS4ERR_SEQ_FALSE_RETRY: + case NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session. Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - break; - case -NFS4ERR_DELAY: - case -NFS4ERR_GRACE: + goto out_retry; + case NFS4ERR_DELAY: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + fallthrough; + case NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); - break; - case -NFS4ERR_RETRY_UNCACHED_REP: - break; + goto out_retry; + case NFS4ERR_RETRY_UNCACHED_REP: + goto out_retry; /* Invalidate Layout errors */ - case -NFS4ERR_PNFS_NO_LAYOUT: - case -ESTALE: /* mapped NFS4ERR_STALE */ - case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ - case -EISDIR: /* mapped NFS4ERR_ISDIR */ - case -NFS4ERR_FHEXPIRED: - case -NFS4ERR_WRONG_TYPE: + case NFS4ERR_PNFS_NO_LAYOUT: + case NFS4ERR_STALE: + case NFS4ERR_BADHANDLE: + case NFS4ERR_ISDIR: + case NFS4ERR_FHEXPIRED: + case NFS4ERR_WRONG_TYPE: dprintk("%s Invalid layout error %d\n", __func__, task->tk_status); /* @@ -1153,6 +1164,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; + default: + break; + } + + switch (task->tk_status) { /* RPC connection errors */ case -ENETDOWN: case -ENETUNREACH: @@ -1172,27 +1188,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - fallthrough; + break; default: - if (ff_layout_avoid_mds_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; -reset: - dprintk("%s Retry through MDS. Error %d\n", __func__, - task->tk_status); - return -NFS4ERR_RESET_TO_MDS; + break; } + + if (ff_layout_avoid_mds_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + +out_retry: task->tk_status = 0; return -EAGAIN; } /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + u32 op_status, struct nfs_client *clp, struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + switch (op_status) { + case NFS_OK: + case NFSERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFSERR_ACCES: + case NFSERR_BADHANDLE: + case NFSERR_FBIG: + case NFSERR_IO: + case NFSERR_NOSPC: + case NFSERR_ROFS: + case NFSERR_STALE: + goto out_reset_to_pnfs; + case NFSERR_JUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: + break; + } + switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ case -EACCES: @@ -1216,6 +1261,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); } +out_reset_to_pnfs: /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; out_retry: @@ -1226,6 +1272,7 @@ out_retry: } static int ff_layout_async_handle_error(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, @@ -1244,10 +1291,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: - return ff_layout_async_handle_error_v3(task, clp, lseg, idx); - case 4: - return ff_layout_async_handle_error_v4(task, state, clp, + return ff_layout_async_handle_error_v3(task, op_status, clp, lseg, idx); + case 4: + return ff_layout_async_handle_error_v4(task, op_status, state, + clp, lseg, idx); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1300,6 +1348,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: + case NFS4ERR_PERM: break; case NFS4ERR_NXIO: ff_layout_mark_ds_unreachable(lseg, idx); @@ -1332,7 +1381,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, trace_ff_layout_read_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -1505,7 +1555,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, trace_ff_layout_write_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -1554,8 +1605,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, trace_ff_layout_commit_error(data, task->tk_status); } - err = ff_layout_async_handle_error(task, NULL, data->ds_clp, - data->lseg, data->ds_commit_index); + err = ff_layout_async_handle_error(task, data->res.op_status, + NULL, data->ds_clp, data->lseg, + data->ds_commit_index); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 4a304cf17c4b..656d5c50bbce 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, * keep ds_clp even if DS is local, so that if local IO cannot * proceed somehow, we can fall back to NFS whenever we want. */ - nfs_local_probe(ds->ds_clp); + nfs_local_probe_async(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e278a1ad1ca3..8b0785178731 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -367,6 +367,7 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) sreq = netfs->sreq; if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_UNBUFFERED_READ && sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 119e447758b9..a2fa6bc4d74e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -557,6 +557,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + else + set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) @@ -633,6 +635,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; @@ -701,14 +731,27 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); - nfs_update_timestamps(inode, attr->ia_valid); + if (attr->ia_valid & ATTR_MTIME_SET) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; + if (attr->ia_valid & ATTR_ATIME_SET) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } } /* Optimization: if the end result is no change, don't RPC */ @@ -2546,15 +2589,26 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); + int err; nfs_clients_init(net); if (!rpc_proc_register(net, &nn->rpcstats)) { - nfs_clients_exit(net); - return -ENOMEM; + err = -ENOMEM; + goto err_proc_rpc; } - return nfs_fs_proc_net_init(net); + err = nfs_fs_proc_net_init(net); + if (err) + goto err_proc_nfs; + + return 0; + +err_proc_nfs: + rpc_proc_unregister(net, "nfs"); +err_proc_rpc: + nfs_clients_exit(net); + return err; } static void nfs_net_exit(struct net *net) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6655e5f32ec6..26551ff09a52 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -432,7 +432,7 @@ loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -int nfs_file_mmap(struct file *, struct vm_area_struct *); +int nfs_file_mmap_prepare(struct vm_area_desc *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); @@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ -extern void nfs_local_probe(struct nfs_client *); extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4ec952f9f47d..510d0a16cfe9 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp) * - called after alloc_client and init_client (so cl_rpcclient exists) * - this function is idempotent, it can be called for old or new clients */ -void nfs_local_probe(struct nfs_client *clp) +static void nfs_local_probe(struct nfs_client *clp) { /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ if (!localio_enabled || @@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *clp) nfs_localio_enable_client(clp); nfs_uuid_end(&clp->cl_uuid); } -EXPORT_SYMBOL_GPL(nfs_local_probe); void nfs_local_probe_async_work(struct work_struct *work) { struct nfs_client *clp = container_of(work, struct nfs_client, cl_local_probe_work); + if (!refcount_inc_not_zero(&clp->cl_count)) + return; nfs_local_probe(clp); + nfs_put_client(clp); } void nfs_local_probe_async(struct nfs_client *clp) @@ -207,14 +209,16 @@ void nfs_local_probe_async(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_local_probe_async); -static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf) +static inline void nfs_local_file_put(struct nfsd_file *localio) { - return nfs_to->nfsd_file_get(nf); -} + /* nfs_to_nfsd_file_put_local() expects an __rcu pointer + * but we have a __kernel pointer. It is always safe + * to cast a __kernel pointer to an __rcu pointer + * because the cast only weakens what is known about the pointer. + */ + struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio; -static inline void nfs_local_file_put(struct nfsd_file *nf) -{ - nfs_to->nfsd_file_put(nf); + nfs_to_nfsd_file_put_local(&nf); } /* @@ -226,12 +230,13 @@ static inline void nfs_local_file_put(struct nfsd_file *nf) static struct nfsd_file * __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t mode) { struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, - cred, fh, nfl, mode); + cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { int status = PTR_ERR(localio); trace_nfs_local_open_fh(fh, mode, status); @@ -258,7 +263,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, const fmode_t mode) { - struct nfsd_file *nf, *new, __rcu **pnf; + struct nfsd_file *nf, __rcu **pnf; if (!nfs_server_is_local(clp)) return NULL; @@ -270,29 +275,9 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, else pnf = &nfl->ro_file; - new = NULL; - rcu_read_lock(); - nf = rcu_dereference(*pnf); - if (!nf) { - rcu_read_unlock(); - new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); - if (IS_ERR(new)) - return NULL; - rcu_read_lock(); - /* try to swap in the pointer */ - spin_lock(&clp->cl_uuid.lock); - nf = rcu_dereference_protected(*pnf, 1); - if (!nf) { - nf = new; - new = NULL; - rcu_assign_pointer(*pnf, nf); - } - spin_unlock(&clp->cl_uuid.lock); - } - nf = nfs_local_file_get(nf); - rcu_read_unlock(); - if (new) - nfs_to_nfsd_file_put_local(new); + nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode); + if (IS_ERR(nf)) + return NULL; return nf; } EXPORT_SYMBOL_GPL(nfs_local_open_fh); diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0282d93c8bcc..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t); ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5cf52ece96ac..01c01f45358b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -146,7 +146,8 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -169,7 +170,31 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); + + inode_unlock(inode); + return err; +} + +int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); return err; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b1b663468249..4cc915d5741d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -174,6 +174,18 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -649,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* * Encode READ_PLUS request */ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, @@ -1511,6 +1544,37 @@ out: } /* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* * Decode READ_PLUS request */ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7d383d29a995..d3ca91f60fc1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -67,8 +67,7 @@ struct nfs4_minor_version_ops { void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); int (*test_and_free_expired)(struct nfs_server *, - const nfs4_stateid *, - const struct cred *); + nfs4_stateid *, const struct cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); void (*session_trunk)(struct rpc_clnt *clnt, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1cd9652f3c28..5c749b6117bb 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } @@ -448,7 +456,7 @@ static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs4_file_open, .flush = nfs4_file_flush, .release = nfs_file_release, diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 25a7c771cfd8..00932500fce4 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -424,26 +424,16 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - if (pipe->dentry) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b1d2122bd5a7..341740fa293d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,7 +105,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *); -static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, const struct cred *, bool); #endif @@ -325,14 +325,14 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); if (!(cache_validity & NFS_INO_INVALID_MTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_MODIFY; + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); if (!(cache_validity & NFS_INO_INVALID_CTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_METADATA; + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); } } @@ -2903,16 +2903,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st } static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { return -NFS4ERR_BAD_STATEID; } #if defined(CONFIG_NFS_V4_1) static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { int status; @@ -2921,6 +2919,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, break; case NFS4_INVALID_STATEID_TYPE: case NFS4_SPECIAL_STATEID_TYPE: + case NFS4_FREED_STATEID_TYPE: return -NFS4ERR_BAD_STATEID; case NFS4_REVOKED_STATEID_TYPE: goto out_free; @@ -3976,8 +3975,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | - FATTR4_WORD2_OPEN_ARGUMENTS; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { @@ -5164,13 +5164,15 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ } static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, - struct nfs4_createdata *data) + struct nfs4_createdata *data, int *statusp) { - int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + struct dentry *ret; + + *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); - if (status) - return ERR_PTR(status); + if (*statusp) + return NULL; spin_lock(&dir->i_lock); /* Creating a directory bumps nlink in the parent */ @@ -5179,7 +5181,11 @@ static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, data->res.fattr->time_start, NFS_INO_INVALID_DATA); spin_unlock(&dir->i_lock); - return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + if (!IS_ERR(ret)) + return ret; + *statusp = PTR_ERR(ret); + return NULL; } static void nfs4_free_createdata(struct nfs4_createdata *data) @@ -5240,17 +5246,18 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, int *statusp) { struct nfs4_createdata *data; - struct dentry *ret = ERR_PTR(-ENOMEM); + struct dentry *ret = NULL; + *statusp = -ENOMEM; data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - ret = nfs4_do_mkdir(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data, statusp); nfs4_free_createdata(data); out: @@ -5273,11 +5280,12 @@ static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - alias = _nfs4_proc_mkdir(dir, dentry, sattr, label); - err = PTR_ERR_OR_ZERO(alias); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); - err = nfs4_handle_exception(NFS_SERVER(dir), err, - &exception); + if (err) + alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + err, + &exception)); } while (exception.retry); nfs4_label_release_security(label); @@ -6211,6 +6219,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, struct nfs_server *server = NFS_SERVER(inode); int ret; + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); @@ -6285,6 +6295,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); @@ -10611,7 +10624,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, + nfs4_stateid *stateid, const struct cred *cred, bool privileged) { @@ -10651,6 +10664,7 @@ static int nfs41_free_stateid(struct nfs_server *server, if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); + stateid->type = NFS4_FREED_STATEID_TYPE; return 0; } @@ -10817,6 +10831,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_OFFLOAD_CANCEL | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE @@ -10852,7 +10867,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2, error3; + ssize_t error, error2, error3, error4; size_t left = size; error = generic_listxattr(dentry, list, left); @@ -10875,8 +10890,16 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; + if (list) { + list += error3; + left -= error3; + } + + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; - error += error2 + error3; + error += error2 + error3 + error4; if (size && error > size) return -ERANGE; return error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 55bef5fbfa47..318afde38057 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7711,6 +7711,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3adb7d0dbec7..1a7ec68bde15 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2059,8 +2059,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { if (atomic_dec_and_test(&lo->plh_outstanding) && - test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) { + smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); + } } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 91ef486f40b9..b4ccdf78d4dd 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -830,10 +830,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .servername = clp->cl_hostname, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, + .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto) + if (da->da_transport != clp->cl_proto && + clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) continue; + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + if (da->da_addr.ss_family != clp->cl_addr.ss_family) continue; /* Add this address as an alias */ @@ -841,6 +847,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_test_and_add_xprt, NULL); continue; } + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + da->da_transport = XPRT_TRANSPORT_TCP_TLS; clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, da->da_transport, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 81bd1b9aba17..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); - folio_unlock(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9eea9e62afc9..72dee6f3050e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1052,6 +1052,16 @@ int nfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); /* + * The SB_RDONLY flag has been removed from the superblock during + * mounts to prevent interference between different filesystems. + * Similarly, it is also necessary to ignore the SB_RDONLY flag + * during reconfiguration; otherwise, it may also result in the + * creation of redundant superblocks when mounting a directory with + * different rw and ro flags multiple times. + */ + fc->sb_flags_mask &= ~SB_RDONLY; + + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which * ones were explicitly specified. Fall back to legacy behavior and @@ -1173,7 +1183,7 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc) struct nfs_server *server = fc->s_fs_info; int ret; - s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops); ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -1308,8 +1318,17 @@ int nfs_get_tree_common(struct fs_context *fc) if (IS_ERR(server)) return PTR_ERR(server); + /* + * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a + * superblock among each filesystem that mounts sub-directories + * belonging to a single exported root path. + * To prevent interference between different filesystems, the + * SB_RDONLY flag should be removed from the superblock. + */ if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + else + fc->sb_flags &= ~SB_RDONLY; /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 37cb2b776435..545148d42dcc 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -387,6 +387,33 @@ static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static ssize_t +localio_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool localio = nfs_server_is_local(server->nfs_client); + return sysfs_emit(buf, "%d\n", localio); +} + +static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio); + +static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ + int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else +static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -405,6 +432,7 @@ void nfs_sysfs_add_server(struct nfs_server *server) server->s_sysfs_id, ret); nfs_sysfs_add_nfsv41_server(server); + nfs_sysfs_add_nfs_localio_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 23df8b214474..cf1d720b8251 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -632,19 +632,19 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct folio *folio, - struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; - int ret = 0; + int ret; + + nfs_pageio_cond_complete(pgio, folio->index); req = nfs_lock_and_join_requests(folio); if (!req) - goto out; - ret = PTR_ERR(req); + return 0; if (IS_ERR(req)) - goto out; + return PTR_ERR(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -654,7 +654,6 @@ static int nfs_page_async_flush(struct folio *folio, if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -662,28 +661,20 @@ static int nfs_page_async_flush(struct folio *folio, */ if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - if (wbc->sync_mode == WB_SYNC_NONE) - ret = AOP_WRITEPAGE_ACTIVATE; folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; - } else - nfs_add_stats(folio->mapping->host, - NFSIOS_WRITEPAGES, 1); -out: - return ret; + return ret; + } + + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); + return 0; + out_launder: nfs_write_error(req, ret); return 0; } -static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) -{ - nfs_pageio_cond_complete(pgio, folio->index); - return nfs_page_async_flush(folio, wbc, pgio); -} - /* * Write an mmapped page to the server. */ @@ -703,17 +694,6 @@ static int nfs_writepage_locked(struct folio *folio, return err; } -static int nfs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) -{ - int ret; - - ret = nfs_do_writepage(folio, wbc, data); - if (ret != AOP_WRITEPAGE_ACTIVATE) - folio_unlock(folio); - return ret; -} - static void nfs_io_completion_commit(void *inode) { nfs_commit_inode(inode, 0); @@ -740,7 +720,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || - wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + wbc->for_background || wbc->for_sync) { ioc = nfs_io_completion_alloc(GFP_KERNEL); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, @@ -749,11 +729,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } do { + struct folio *folio = NULL; + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, - &pgio); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = nfs_do_writepage(folio, wbc, &pgio); + folio_unlock(folio); + } pgio.pg_error = 0; nfs_pageio_complete(&pgio); if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 6a0bdea6d644..05c7c16e37ab 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -151,8 +151,7 @@ EXPORT_SYMBOL_GPL(nfs_localio_enable_client); */ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) { - LIST_HEAD(local_files); - struct nfs_file_localio *nfl, *tmp; + struct nfs_file_localio *nfl; spin_lock(&nfs_uuid->lock); if (unlikely(!rcu_access_pointer(nfs_uuid->net))) { @@ -166,17 +165,42 @@ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) nfs_uuid->dom = NULL; } - list_splice_init(&nfs_uuid->files, &local_files); - spin_unlock(&nfs_uuid->lock); - /* Walk list of files and ensure their last references dropped */ - list_for_each_entry_safe(nfl, tmp, &local_files, list) { - nfs_close_local_fh(nfl); + + while ((nfl = list_first_entry_or_null(&nfs_uuid->files, + struct nfs_file_localio, + list)) != NULL) { + /* If nfs_uuid is already NULL, nfs_close_local_fh is + * closing and we must wait, else we unlink and close. + */ + if (rcu_access_pointer(nfl->nfs_uuid) == NULL) { + /* nfs_close_local_fh() is doing the + * close and we must wait. until it unlinks + */ + wait_var_event_spinlock(nfl, + list_first_entry_or_null( + &nfs_uuid->files, + struct nfs_file_localio, + list) != nfl, + &nfs_uuid->lock); + continue; + } + + /* Remove nfl from nfs_uuid->files list */ + list_del_init(&nfl->list); + spin_unlock(&nfs_uuid->lock); + + nfs_to_nfsd_file_put_local(&nfl->ro_file); + nfs_to_nfsd_file_put_local(&nfl->rw_file); cond_resched(); - } - spin_lock(&nfs_uuid->lock); - BUG_ON(!list_empty(&nfs_uuid->files)); + spin_lock(&nfs_uuid->lock); + /* Now we can allow racing nfs_close_local_fh() to + * skip the locking. + */ + RCU_INIT_POINTER(nfl->nfs_uuid, NULL); + wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock); + } /* Remove client from nn->local_clients */ if (nfs_uuid->list_lock) { @@ -237,6 +261,7 @@ static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t fmode) { struct net *net; @@ -261,10 +286,9 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, rcu_read_unlock(); /* We have an implied reference to net thanks to nfsd_net_try_get */ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, - cred, nfs_fh, fmode); - if (IS_ERR(localio)) - nfs_to_nfsd_net_put(net); - else + cred, nfs_fh, pnf, fmode); + nfs_to_nfsd_net_put(net); + if (!IS_ERR(localio)) nfs_uuid_add_file(uuid, nfl); return localio; @@ -273,8 +297,6 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh); void nfs_close_local_fh(struct nfs_file_localio *nfl) { - struct nfsd_file *ro_nf = NULL; - struct nfsd_file *rw_nf = NULL; nfs_uuid_t *nfs_uuid; rcu_read_lock(); @@ -285,28 +307,39 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl) return; } - ro_nf = rcu_access_pointer(nfl->ro_file); - rw_nf = rcu_access_pointer(nfl->rw_file); - if (ro_nf || rw_nf) { - spin_lock(&nfs_uuid->lock); - if (ro_nf) - ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1); - if (rw_nf) - rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1); - - /* Remove nfl from nfs_uuid->files list */ - RCU_INIT_POINTER(nfl->nfs_uuid, NULL); - list_del_init(&nfl->list); + spin_lock(&nfs_uuid->lock); + if (!rcu_access_pointer(nfl->nfs_uuid)) { + /* nfs_uuid_put has finished here */ spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); - - if (ro_nf) - nfs_to_nfsd_file_put_local(ro_nf); - if (rw_nf) - nfs_to_nfsd_file_put_local(rw_nf); return; } + if (list_empty(&nfs_uuid->files)) { + /* nfs_uuid_put() has started closing files, wait for it + * to finished + */ + spin_unlock(&nfs_uuid->lock); + rcu_read_unlock(); + wait_var_event(&nfl->nfs_uuid, + rcu_access_pointer(nfl->nfs_uuid) == NULL); + return; + } + /* tell nfs_uuid_put() to wait for us */ + RCU_INIT_POINTER(nfl->nfs_uuid, NULL); + spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); + + nfs_to_nfsd_file_put_local(&nfl->ro_file); + nfs_to_nfsd_file_put_local(&nfl->rw_file); + + /* Remove nfl from nfs_uuid->files list and signal nfs_uuid_put() + * that we are done. The moment we drop the spinlock the + * nfs_uuid could be freed. + */ + spin_lock(&nfs_uuid->lock); + list_del_init(&nfl->list); + wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock); + spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_close_local_fh); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 08a20e5bcf7f..19078a043e85 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -178,11 +178,13 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, { struct iomap *iomaps; int nr_iomaps; + __be32 nfserr; - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, i_blocksize(inode)); - if (nr_iomaps < 0) - return nfserrno(nr_iomaps); + nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, &nr_iomaps, + i_blocksize(inode)); + if (nfserr != nfs_ok) + return nfserr; return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } @@ -316,11 +318,13 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, { struct iomap *iomaps; int nr_iomaps; + __be32 nfserr; - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, i_blocksize(inode)); - if (nr_iomaps < 0) - return nfserrno(nr_iomaps); + nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, &nr_iomaps, + i_blocksize(inode)); + if (nfserr != nfs_ok) + return nfserr; return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index ce78f74715ee..bcf21fde9120 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -112,35 +112,46 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, return 0; } -int +/** + * nfsd4_block_decode_layoutupdate - decode the block layout extent array + * @p: pointer to the xdr data + * @len: number of bytes to decode + * @iomapp: pointer to store the decoded extent array + * @nr_iomapsp: pointer to store the number of extents + * @block_size: alignment of extent offset and length + * + * This function decodes the opaque field of the layoutupdate4 structure + * in a layoutcommit request for the block layout driver. The field is + * actually an array of extents sent by the client. It also checks that + * the file offset, storage offset and length of each extent are aligned + * by @block_size. + * + * Return values: + * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid + * %nfserr_bad_xdr: The encoded array in @p is invalid + * %nfserr_inval: An unaligned extent found + * %nfserr_delay: Failed to allocate memory for @iomapp + */ +__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size) + int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; u32 nr_iomaps, i; - if (len < sizeof(u32)) { - dprintk("%s: extent array too small: %u\n", __func__, len); - return -EINVAL; - } + if (len < sizeof(u32)) + return nfserr_bad_xdr; len -= sizeof(u32); - if (len % PNFS_BLOCK_EXTENT_SIZE) { - dprintk("%s: extent array invalid: %u\n", __func__, len); - return -EINVAL; - } + if (len % PNFS_BLOCK_EXTENT_SIZE) + return nfserr_bad_xdr; nr_iomaps = be32_to_cpup(p++); - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { - dprintk("%s: extent array size mismatch: %u/%u\n", - __func__, len, nr_iomaps); - return -EINVAL; - } + if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) + return nfserr_bad_xdr; iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); - if (!iomaps) { - dprintk("%s: failed to allocate extent array\n", __func__); - return -ENOMEM; - } + if (!iomaps) + return nfserr_delay; for (i = 0; i < nr_iomaps; i++) { struct pnfs_block_extent bex; @@ -150,26 +161,18 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, p = xdr_decode_hyper(p, &bex.foff); if (bex.foff & (block_size - 1)) { - dprintk("%s: unaligned offset 0x%llx\n", - __func__, bex.foff); goto fail; } p = xdr_decode_hyper(p, &bex.len); if (bex.len & (block_size - 1)) { - dprintk("%s: unaligned length 0x%llx\n", - __func__, bex.foff); goto fail; } p = xdr_decode_hyper(p, &bex.soff); if (bex.soff & (block_size - 1)) { - dprintk("%s: unaligned disk offset 0x%llx\n", - __func__, bex.soff); goto fail; } bex.es = be32_to_cpup(p++); if (bex.es != PNFS_BLOCK_READWRITE_DATA) { - dprintk("%s: incorrect extent state %d\n", - __func__, bex.es); goto fail; } @@ -178,59 +181,71 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, } *iomapp = iomaps; - return nr_iomaps; + *nr_iomapsp = nr_iomaps; + return nfs_ok; fail: kfree(iomaps); - return -EINVAL; + return nfserr_inval; } -int +/** + * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array + * @p: pointer to the xdr data + * @len: number of bytes to decode + * @iomapp: pointer to store the decoded extent array + * @nr_iomapsp: pointer to store the number of extents + * @block_size: alignment of extent offset and length + * + * This function decodes the opaque field of the layoutupdate4 structure + * in a layoutcommit request for the scsi layout driver. The field is + * actually an array of extents sent by the client. It also checks that + * the offset and length of each extent are aligned by @block_size. + * + * Return values: + * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid + * %nfserr_bad_xdr: The encoded array in @p is invalid + * %nfserr_inval: An unaligned extent found + * %nfserr_delay: Failed to allocate memory for @iomapp + */ +__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size) + int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; u32 nr_iomaps, expected, i; - if (len < sizeof(u32)) { - dprintk("%s: extent array too small: %u\n", __func__, len); - return -EINVAL; - } + if (len < sizeof(u32)) + return nfserr_bad_xdr; nr_iomaps = be32_to_cpup(p++); expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; - if (len != expected) { - dprintk("%s: extent array size mismatch: %u/%u\n", - __func__, len, expected); - return -EINVAL; - } + if (len != expected) + return nfserr_bad_xdr; iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); - if (!iomaps) { - dprintk("%s: failed to allocate extent array\n", __func__); - return -ENOMEM; - } + if (!iomaps) + return nfserr_delay; for (i = 0; i < nr_iomaps; i++) { u64 val; p = xdr_decode_hyper(p, &val); if (val & (block_size - 1)) { - dprintk("%s: unaligned offset 0x%llx\n", __func__, val); goto fail; } iomaps[i].offset = val; p = xdr_decode_hyper(p, &val); if (val & (block_size - 1)) { - dprintk("%s: unaligned length 0x%llx\n", __func__, val); goto fail; } iomaps[i].length = val; } *iomapp = iomaps; - return nr_iomaps; + *nr_iomapsp = nr_iomaps; + return nfs_ok; fail: kfree(iomaps); - return -EINVAL; + return nfserr_inval; } diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index 4e28ac8f1127..15b3569f3d9a 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp); -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size); -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size); +__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, + struct iomap **iomapp, int *nr_iomapsp, u32 block_size); +__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, + struct iomap **iomapp, int *nr_iomapsp, u32 block_size); #endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 88ae410b4113..cadfc2bae60e 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -82,8 +82,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) int len; struct auth_domain *dom = NULL; int err; - int fsidtype; - char *ep; + u8 fsidtype; struct svc_expkey key; struct svc_expkey *ek = NULL; @@ -109,10 +108,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; - fsidtype = simple_strtoul(buf, &ep, 10); - if (*ep) + if (kstrtou8(buf, 10, &fsidtype)) goto out; - dprintk("found fsidtype %d\n", fsidtype); + dprintk("found fsidtype %u\n", fsidtype); if (key_len(fsidtype)==0) /* invalid type */ goto out; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 4d92b99c1ffd..b9c0adb3ce09 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -88,7 +88,7 @@ struct svc_expkey { struct cache_head h; struct auth_domain * ek_client; - int ek_fsidtype; + u8 ek_fsidtype; u32 ek_fsid[6]; struct path ek_path; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ab85e6a2454f..732abf6b92a5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -372,21 +372,47 @@ nfsd_file_put(struct nfsd_file *nf) /** * nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller - * @nf: nfsd_file of which to put the reference + * @pnf: nfsd_file of which to put the reference * * First save the associated net to return to caller, then put * the reference of the nfsd_file. */ struct net * -nfsd_file_put_local(struct nfsd_file *nf) +nfsd_file_put_local(struct nfsd_file __rcu **pnf) { - struct net *net = nf->nf_net; + struct nfsd_file *nf; + struct net *net = NULL; - nfsd_file_put(nf); + nf = unrcu_pointer(xchg(pnf, NULL)); + if (nf) { + net = nf->nf_net; + nfsd_file_put(nf); + } return net; } /** + * nfsd_file_get_local - get nfsd_file reference and reference to net + * @nf: nfsd_file of which to put the reference + * + * Get reference to both the nfsd_file and nf->nf_net. + */ +struct nfsd_file * +nfsd_file_get_local(struct nfsd_file *nf) +{ + struct net *net = nf->nf_net; + + if (nfsd_net_try_get(net)) { + nf = nfsd_file_get(nf); + if (!nf) + nfsd_net_put(net); + } else { + nf = NULL; + } + return nf; +} + +/** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. * diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 5865f9c72712..722b26c71e45 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -62,7 +62,8 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -struct net *nfsd_file_put_local(struct nfsd_file *nf); +struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); +struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 238647fa379e..4f6468eb2adf 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -24,21 +24,6 @@ #include "filecache.h" #include "cache.h" -static const struct nfsd_localio_operations nfsd_localio_ops = { - .nfsd_net_try_get = nfsd_net_try_get, - .nfsd_net_put = nfsd_net_put, - .nfsd_open_local_fh = nfsd_open_local_fh, - .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get = nfsd_file_get, - .nfsd_file_put = nfsd_file_put, - .nfsd_file_file = nfsd_file_file, -}; - -void nfsd_localio_ops_init(void) -{ - nfs_to = &nfsd_localio_ops; -} - /** * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file * @@ -47,6 +32,7 @@ void nfsd_localio_ops_init(void) * @rpc_clnt: rpc_clnt that the client established * @cred: cred that the client established * @nfs_fh: filehandle to lookup + * @pnf: place to find the nfsd_file, or store it if it was non-NULL * @fmode: fmode_t to use for open * * This function maps a local fh to a path on a local filesystem. @@ -57,10 +43,11 @@ void nfsd_localio_ops_init(void) * set. Caller (NFS client) is responsible for calling nfsd_net_put and * nfsd_file_put (via nfs_to_nfsd_file_put_local). */ -struct nfsd_file * +static struct nfsd_file * nfsd_open_local_fh(struct net *net, struct auth_domain *dom, struct rpc_clnt *rpc_clnt, const struct cred *cred, - const struct nfs_fh *nfs_fh, const fmode_t fmode) + const struct nfs_fh *nfs_fh, struct nfsd_file __rcu **pnf, + const fmode_t fmode) { int mayflags = NFSD_MAY_LOCALIO; struct svc_cred rq_cred; @@ -71,6 +58,15 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (nfs_fh->size > NFS4_FHSIZE) return ERR_PTR(-EINVAL); + if (!nfsd_net_try_get(net)) + return ERR_PTR(-ENXIO); + + rcu_read_lock(); + localio = nfsd_file_get(rcu_dereference(*pnf)); + rcu_read_unlock(); + if (localio) + return localio; + /* nfs_fh -> svc_fh */ fh_init(&fh, NFS4_FHSIZE); fh.fh_handle.fh_size = nfs_fh->size; @@ -92,9 +88,47 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (rq_cred.cr_group_info) put_group_info(rq_cred.cr_group_info); + if (!IS_ERR(localio)) { + struct nfsd_file *new; + if (!nfsd_net_try_get(net)) { + nfsd_file_put(localio); + nfsd_net_put(net); + return ERR_PTR(-ENXIO); + } + nfsd_file_get(localio); + again: + new = unrcu_pointer(cmpxchg(pnf, NULL, RCU_INITIALIZER(localio))); + if (new) { + /* Some other thread installed an nfsd_file */ + if (nfsd_file_get(new) == NULL) + goto again; + /* + * Drop the ref we were going to install and the + * one we were going to return. + */ + nfsd_file_put(localio); + nfsd_file_put(localio); + localio = new; + } + } else + nfsd_net_put(net); + return localio; } -EXPORT_SYMBOL_GPL(nfsd_open_local_fh); + +static const struct nfsd_localio_operations nfsd_localio_ops = { + .nfsd_net_try_get = nfsd_net_try_get, + .nfsd_net_put = nfsd_net_put, + .nfsd_open_local_fh = nfsd_open_local_fh, + .nfsd_file_put_local = nfsd_file_put_local, + .nfsd_file_get_local = nfsd_file_get_local, + .nfsd_file_file = nfsd_file_file, +}; + +void nfsd_localio_ops_init(void) +{ + nfs_to = &nfsd_localio_ops; +} /* * UUID_IS_LOCAL XDR functions diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a817d8485d21..b6d03e1ef5f7 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -561,7 +561,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, buf->pages = rqstp->rq_next_page; rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; - xdr_init_encode_pages(xdr, buf, buf->pages, NULL); + xdr_init_encode_pages(xdr, buf); } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ccb00aa93be0..e00b2aea8da2 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1409,6 +1409,7 @@ void nfsd41_cb_referring_call(struct nfsd4_callback *cb, out: if (!rcl->__nr_referring_calls) { cb->cb_nr_referring_call_list--; + list_del(&rcl->__list); kfree(rcl); } } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 290271ac4245..aea905fcaf87 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -65,7 +65,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp) return; map->fsid_type = fh->fh_fsid_type; - memcpy(&map->fsid, fh->fh_fsid, fsid_len); + memcpy(&map->fsid, fh_fsid(fh), fsid_len); spin_lock(&nfsd_devid_lock); if (fhp->fh_export->ex_devid_map) @@ -75,7 +75,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp) list_for_each_entry(old, &nfsd_devid_hash[i], hash) { if (old->fsid_type != fh->fh_fsid_type) continue; - if (memcmp(old->fsid, fh->fh_fsid, + if (memcmp(old->fsid, fh_fsid(fh), key_len(old->fsid_type))) continue; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f13abbb13b38..71b428efcbb5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1917,13 +1917,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd42_write_res *result; __be32 status; - /* - * Currently, async COPY is not reliable. Force all COPY - * requests to be synchronous to avoid client application - * hangs waiting for COPY completion. - */ - nfsd4_copy_set_sync(copy, true); - result = ©->cp_res; nfsd_copy_write_verifier((__be32 *)&result->wr_verifier.data, nn); @@ -2842,20 +2835,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) rqstp->rq_lease_breaker = (void **)&cstate->clp; - trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); + trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; - if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) { - /* If there are still more operations to process, - * stop here and report NFS4ERR_RESOURCE. */ - if (cstate->minorversion == 0 && - args->client_opcnt > resp->opcnt) { - op->status = nfserr_resource; - goto encode_op; - } - } - /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -2932,7 +2915,7 @@ encode_op: status = op->status; } - trace_nfsd_compound_status(args->client_opcnt, resp->opcnt, + trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 82785db730d9..2231192ec33f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -950,38 +950,32 @@ static const struct rpc_pipe_ops cld_upcall_ops = { .destroy_msg = cld_pipe_destroy_msg, }; -static struct dentry * +static int nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); + return -ENOENT; + err = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); dput(dir); - return dentry; + return err; } -static void -nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); -} - -static struct dentry * +static int nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *sb; - struct dentry *dentry; + int err; sb = rpc_get_sb_net(net); if (!sb) - return NULL; - dentry = nfsd4_cld_register_sb(sb, pipe); + return 0; + err = nfsd4_cld_register_sb(sb, pipe); rpc_put_sb_net(net); - return dentry; + return err; } static void @@ -991,7 +985,7 @@ nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) sb = rpc_get_sb_net(net); if (sb) { - nfsd4_cld_unregister_sb(pipe); + rpc_unlink(pipe); rpc_put_sb_net(net); } } @@ -1001,7 +995,6 @@ static int __nfsd4_init_cld_pipe(struct net *net) { int ret; - struct dentry *dentry; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn; @@ -1022,13 +1015,10 @@ __nfsd4_init_cld_pipe(struct net *net) spin_lock_init(&cn->cn_lock); INIT_LIST_HEAD(&cn->cn_list); - dentry = nfsd4_cld_register_net(net, cn->cn_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); + ret = nfsd4_cld_register_net(net, cn->cn_pipe); + if (unlikely(ret)) goto err_destroy_data; - } - cn->cn_pipe->dentry = dentry; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING cn->cn_has_legacy = false; #endif @@ -2121,7 +2111,6 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) struct net *net = sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; - struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -2134,16 +2123,10 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) switch (event) { case RPC_PIPEFS_MOUNT: - dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - cn->cn_pipe->dentry = dentry; + ret = nfsd4_cld_register_sb(sb, cn->cn_pipe); break; case RPC_PIPEFS_UMOUNT: - if (cn->cn_pipe->dentry) - nfsd4_cld_unregister_sb(cn->cn_pipe); + rpc_unlink(cn->cn_pipe); break; default: ret = -ENOTSUPP; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d5694987f86f..88c347957da5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -633,18 +633,6 @@ find_readable_file(struct nfs4_file *f) return ret; } -static struct nfsd_file * -find_rw_file(struct nfs4_file *f) -{ - struct nfsd_file *ret; - - spin_lock(&f->fi_lock); - ret = nfsd_file_get(f->fi_fds[O_RDWR]); - spin_unlock(&f->fi_lock); - - return ret; -} - struct nfsd_file * find_any_file(struct nfs4_file *f) { @@ -1218,15 +1206,20 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) static void put_deleg_file(struct nfs4_file *fp) { + struct nfsd_file *rnf = NULL; struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); - if (--fp->fi_delegees == 0) + if (--fp->fi_delegees == 0) { swap(nf, fp->fi_deleg_file); + swap(rnf, fp->fi_rdeleg_file); + } spin_unlock(&fp->fi_lock); if (nf) nfsd_file_put(nf); + if (rnf) + nfs4_file_put_access(fp, NFS4_SHARE_ACCESS_READ); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) @@ -3872,7 +3865,6 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs ca->headerpadsz = 0; ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); - ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); @@ -4697,10 +4689,16 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } status = nfs_ok; if (conf) { - old = unconf; - unhash_client_locked(old); - nfsd4_change_callback(conf, &unconf->cl_cb_conn); - } else { + if (get_client_locked(conf) == nfs_ok) { + old = unconf; + unhash_client_locked(old); + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + } else { + conf = NULL; + } + } + + if (!conf) { old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = nfserr_clid_inuse; @@ -4717,10 +4715,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } trace_nfsd_clid_replaced(&old->cl_clientid); } + status = get_client_locked(unconf); + if (status != nfs_ok) { + old = NULL; + goto out; + } move_to_confirmed(unconf); conf = unconf; } - get_client_locked(conf); spin_unlock(&nn->client_lock); if (conf == unconf) fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); @@ -4750,6 +4752,7 @@ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle); fp->fi_deleg_file = NULL; + fp->fi_rdeleg_file = NULL; fp->fi_had_conflict = false; fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); @@ -6000,14 +6003,19 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, * on its own, all opens." * - * Furthermore the client can use a write delegation for most READ - * operations as well, so we require a O_RDWR file here. + * Furthermore, section 9.1.2 says: + * + * "In the case of READ, the server may perform the corresponding + * check on the access mode, or it may choose to allow READ for + * OPEN4_SHARE_ACCESS_WRITE, to accommodate clients whose WRITE + * implementation may unavoidably do reads (e.g., due to buffer + * cache constraints)." * - * Offer a write delegation in the case of a BOTH open, and ensure - * we get the O_RDWR descriptor. + * We choose to offer a write delegation for OPEN with the + * OPEN4_SHARE_ACCESS_WRITE access mode to accommodate such clients. */ - if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { - nf = find_rw_file(fp); + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + nf = find_writeable_file(fp); dl_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; } @@ -6138,7 +6146,7 @@ static bool nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, struct kstat *stat) { - struct nfsd_file *nf = find_rw_file(dp->dl_stid.sc_file); + struct nfsd_file *nf = find_writeable_file(dp->dl_stid.sc_file); struct path path; int rc; @@ -6157,6 +6165,34 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, } /* + * Add NFS4_SHARE_ACCESS_READ to the write delegation granted on OPEN + * with NFS4_SHARE_ACCESS_WRITE by allocating separate nfsd_file and + * struct file to be used for read with delegation stateid. + * + */ +static bool +nfsd4_add_rdaccess_to_wrdeleg(struct svc_rqst *rqstp, struct nfsd4_open *open, + struct svc_fh *fh, struct nfs4_ol_stateid *stp) +{ + struct nfs4_file *fp; + struct nfsd_file *nf = NULL; + + if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == + NFS4_SHARE_ACCESS_WRITE) { + if (nfsd_file_acquire_opened(rqstp, fh, NFSD_MAY_READ, NULL, &nf)) + return (false); + fp = stp->st_stid.sc_file; + spin_lock(&fp->fi_lock); + __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ); + fp = stp->st_stid.sc_file; + fp->fi_fds[O_RDONLY] = nf; + fp->fi_rdeleg_file = nf; + spin_unlock(&fp->fi_lock); + } + return true; +} + +/* * The Linux NFS server does not offer write delegations to NFSv4.0 * clients in order to avoid conflicts between write delegations and * GETATTRs requesting CHANGE or SIZE attributes. @@ -6181,8 +6217,9 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, * open or lock state. */ static void -nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, - struct svc_fh *currentfh) +nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, + struct nfs4_ol_stateid *stp, struct svc_fh *currentfh, + struct svc_fh *fh) { struct nfs4_openowner *oo = openowner(stp->st_stateowner); bool deleg_ts = nfsd4_want_deleg_timestamps(open); @@ -6227,7 +6264,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { - if (!nfs4_delegation_stat(dp, currentfh, &stat)) { + if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) || + !nfs4_delegation_stat(dp, currentfh, &stat)) { nfs4_put_stid(&dp->dl_stid); destroy_delegation(dp); goto out_no_deleg; @@ -6322,6 +6360,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; + if (dp && nfsd4_is_deleg_cur(open) && + (dp->dl_stid.sc_file != fp)) { + /* + * RFC8881 section 8.2.4 mandates the server to return + * NFS4ERR_BAD_STATEID if the selected table entry does + * not match the current filehandle. However returning + * NFS4ERR_BAD_STATEID in the OPEN can cause the client + * to repeatedly retry the operation with the same + * stateid, since the stateid itself is valid. To avoid + * this situation NFSD returns NFS4ERR_INVAL instead. + */ + status = nfserr_inval; + goto out; + } stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; @@ -6383,7 +6435,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ - nfs4_open_delegation(open, stp, &resp->cstate.current_fh); + nfs4_open_delegation(rqstp, open, stp, + &resp->cstate.current_fh, current_fh); /* * If there is an existing open stateid, it must be updated and @@ -7076,7 +7129,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return_revoked = true; if (typemask & SC_TYPE_DELEG) /* Always allow REVOKED for DELEG so we can - * retturn the appropriate error. + * return the appropriate error. */ statusmask |= SC_STATUS_REVOKED; @@ -7119,10 +7172,6 @@ nfs4_find_file(struct nfs4_stid *s, int flags) switch (s->sc_type) { case SC_TYPE_DELEG: - spin_lock(&s->sc_file->fi_lock); - ret = nfsd_file_get(s->sc_file->fi_deleg_file); - spin_unlock(&s->sc_file->fi_lock); - break; case SC_TYPE_OPEN: case SC_TYPE_LOCK: if (flags & RD_STATE) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3afcdbed6e14..ea91bad4eee2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2500,10 +2500,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) return false; - if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0) + if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0) return false; - argp->opcnt = min_t(u32, argp->client_opcnt, - NFSD_MAX_OPS_PER_COMPOUND); if (argp->opcnt > ARRAY_SIZE(argp->iops)) { argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops)); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3f3e9f6c4250..bc6b776fc657 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1436,7 +1436,7 @@ unsigned int nfsd_net_id; static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, struct netlink_callback *cb, - struct nfsd_genl_rqstp *rqstp) + struct nfsd_genl_rqstp *genl_rqstp) { void *hdr; u32 i; @@ -1446,22 +1446,22 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, if (!hdr) return -ENOBUFS; - if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) || - nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) || - nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) || - nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) || - nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) || + if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, genl_rqstp->rq_xid) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, genl_rqstp->rq_flags) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, genl_rqstp->rq_prog) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, genl_rqstp->rq_proc) || + nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, genl_rqstp->rq_vers) || nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME, - ktime_to_us(rqstp->rq_stime), + ktime_to_us(genl_rqstp->rq_stime), NFSD_A_RPC_STATUS_PAD)) return -ENOBUFS; - switch (rqstp->rq_saddr.sa_family) { + switch (genl_rqstp->rq_saddr.sa_family) { case AF_INET: { const struct sockaddr_in *s_in, *d_in; - s_in = (const struct sockaddr_in *)&rqstp->rq_saddr; - d_in = (const struct sockaddr_in *)&rqstp->rq_daddr; + s_in = (const struct sockaddr_in *)&genl_rqstp->rq_saddr; + d_in = (const struct sockaddr_in *)&genl_rqstp->rq_daddr; if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4, s_in->sin_addr.s_addr) || nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4, @@ -1476,8 +1476,8 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, case AF_INET6: { const struct sockaddr_in6 *s_in, *d_in; - s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr; - d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr; + s_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_saddr; + d_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_daddr; if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6, &s_in->sin6_addr) || nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6, @@ -1491,9 +1491,9 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, } } - for (i = 0; i < rqstp->rq_opcnt; i++) + for (i = 0; i < genl_rqstp->rq_opcnt; i++) if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS, - rqstp->rq_opnum[i])) + genl_rqstp->rq_opnum[i])) return -ENOBUFS; genlmsg_end(skb, hdr); @@ -1569,7 +1569,8 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, int j; args = rqstp->rq_argp; - genl_rqstp.rq_opcnt = args->opcnt; + genl_rqstp.rq_opcnt = min_t(u32, args->opcnt, + ARRAY_SIZE(genl_rqstp.rq_opnum)); for (j = 0; j < genl_rqstp.rq_opcnt; j++) genl_rqstp.rq_opnum[j] = args->ops[j].opnum; @@ -1611,7 +1612,7 @@ out_unlock: */ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) { - int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem; + int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem; struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct nlattr *attr; @@ -1621,14 +1622,12 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; /* count number of SERVER_THREADS values */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { - if (nla_type(attr) == NFSD_A_SERVER_THREADS) - count++; - } + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) + nrpools++; mutex_lock(&nfsd_mutex); - nrpools = max(count, nfsd_nrpools(net)); nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL); if (!nthreads) { ret = -ENOMEM; @@ -1636,12 +1635,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) } i = 0; - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { - if (nla_type(attr) == NFSD_A_SERVER_THREADS) { - nthreads[i++] = nla_get_u32(attr); - if (i >= nrpools) - break; - } + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) { + nthreads[i++] = nla_get_u32(attr); + if (i >= nrpools) + break; } if (info->attrs[NFSD_A_SERVER_GRACETIME] || @@ -1782,14 +1780,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) nfsd_minorversion(nn, i, NFSD_CLEAR); - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; u32 major, minor = 0; bool enabled; - if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION) - continue; - if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, nfsd_version_nl_policy, info->extack) < 0) continue; @@ -1940,14 +1936,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * Walk the list of server_socks from userland and move any that match * back to sv_permsocks */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; - if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) - continue; - if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; @@ -2002,15 +1996,13 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) svc_xprt_destroy_all(serv, net); /* walk list of addrs again, open any that still don't exist */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; int ret; - if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) - continue; - if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 1bfd0b4e9af7..1cd0bed57bc2 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -57,9 +57,6 @@ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; -/* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 50 - struct nfsd_genl_rqstp { struct sockaddr rq_daddr; struct sockaddr rq_saddr; @@ -72,7 +69,7 @@ struct nfsd_genl_rqstp { /* NFSv4 compound */ u32 rq_opcnt; - u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND]; + u32 rq_opnum[16]; }; extern struct svc_program nfsd_programs[]; @@ -283,6 +280,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_delay cpu_to_be32(NFS4ERR_DELAY) #define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) #define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) #define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index aef474f1b84b..74cf1f4de174 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -172,6 +172,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, if (len == 0) return error; if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + u32 *fsid = fh_fsid(fh); + /* deprecated, convert to type 3 */ len = key_len(FSID_ENCODE_DEV)/4; fh->fh_fsid_type = FSID_ENCODE_DEV; @@ -181,17 +183,17 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, * confuses sparse, so we must use __force here to * keep it from complaining. */ - fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]), - ntohl((__force __be32)fh->fh_fsid[1]))); - fh->fh_fsid[1] = fh->fh_fsid[2]; + fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fsid[0]), + ntohl((__force __be32)fsid[1]))); + fsid[1] = fsid[2]; } data_left -= len; if (data_left < 0) return error; exp = rqst_exp_find(rqstp ? &rqstp->rq_chandle : NULL, net, client, gssclient, - fh->fh_fsid_type, fh->fh_fsid); - fid = (struct fid *)(fh->fh_fsid + len); + fh->fh_fsid_type, fh_fsid(fh)); + fid = (struct fid *)(fh_fsid(fh) + len); error = nfserr_stale; if (IS_ERR(exp)) { @@ -463,7 +465,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, { if (dentry != exp->ex_path.dentry) { struct fid *fid = (struct fid *) - (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); + (fh_fsid(&fhp->fh_handle) + fhp->fh_handle.fh_size/4 - 1); int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; int fh_flags = (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ? 0 : EXPORT_FH_CONNECTABLE; @@ -614,7 +616,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_handle.fh_auth_type = 0; mk_fsid(fhp->fh_handle.fh_fsid_type, - fhp->fh_handle.fh_fsid, + fh_fsid(&fhp->fh_handle), ex_dev, d_inode(exp->ex_path.dentry)->i_ino, exp->ex_fsid, exp->ex_uuid); diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 5103c2f4d225..1cf979722521 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -49,18 +49,19 @@ struct knfsd_fh { * Points to the current size while * building a new file handle. */ - union { - char fh_raw[NFS4_FHSIZE]; - struct { - u8 fh_version; /* == 1 */ - u8 fh_auth_type; /* deprecated */ - u8 fh_fsid_type; - u8 fh_fileid_type; - u32 fh_fsid[]; /* flexible-array member */ - }; - }; + u8 fh_raw[NFS4_FHSIZE]; }; +#define fh_version fh_raw[0] +#define fh_auth_type fh_raw[1] +#define fh_fsid_type fh_raw[2] +#define fh_fileid_type fh_raw[3] + +static inline u32 *fh_fsid(const struct knfsd_fh *fh) +{ + return (u32 *)&fh->fh_raw[4]; +} + static inline __u32 ino_t_to_u32(ino_t ino) { return (__u32) ino; @@ -260,9 +261,12 @@ static inline bool fh_match(const struct knfsd_fh *fh1, static inline bool fh_fsid_match(const struct knfsd_fh *fh1, const struct knfsd_fh *fh2) { + u32 *fsid1 = fh_fsid(fh1); + u32 *fsid2 = fh_fsid(fh2); + if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; - if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) + if (memcmp(fsid1, fsid2, key_len(fh1->fh_fsid_type)) != 0) return false; return true; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index c10fa8128a8a..8f71f5748c75 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -575,7 +575,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, buf->pages = rqstp->rq_next_page; rqstp->rq_next_page++; - xdr_init_encode_pages(xdr, buf, buf->pages, NULL); + xdr_init_encode_pages(xdr, buf); } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1995bca158b8..8adc2550129e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -665,6 +665,7 @@ struct nfs4_file { atomic_t fi_access[2]; u32 fi_share_deny; struct nfsd_file *fi_deleg_file; + struct nfsd_file *fi_rdeleg_file; int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 3c5505ef5e3a..a664fdf1161e 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -344,7 +344,7 @@ TRACE_EVENT(nfsd_exp_find_key, int status), TP_ARGS(key, status), TP_STRUCT__entry( - __field(int, fsidtype) + __field(u8, fsidtype) __array(u32, fsid, 6) __string(auth_domain, key->ek_client->name) __field(int, status) @@ -367,7 +367,7 @@ TRACE_EVENT(nfsd_expkey_update, TP_PROTO(const struct svc_expkey *key, const char *exp_path), TP_ARGS(key, exp_path), TP_STRUCT__entry( - __field(int, fsidtype) + __field(u8, fsidtype) __array(u32, fsid, 6) __string(auth_domain, key->ek_client->name) __string(path, exp_path) @@ -1108,7 +1108,6 @@ DEFINE_NFSD_FILE_EVENT(nfsd_file_free); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); DEFINE_NFSD_FILE_EVENT(nfsd_file_put); DEFINE_NFSD_FILE_EVENT(nfsd_file_closing); -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue); TRACE_EVENT(nfsd_file_alloc, TP_PROTO( @@ -1344,9 +1343,7 @@ DEFINE_EVENT(nfsd_file_gc_class, name, \ TP_ARGS(nf)) DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add); -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del); -DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); @@ -1380,7 +1377,6 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ TP_ARGS(removed, remaining)) DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); -DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_recent); DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); TRACE_EVENT(nfsd_file_close, @@ -2103,25 +2099,6 @@ TRACE_EVENT(nfsd_ctl_maxblksize, ) ); -TRACE_EVENT(nfsd_ctl_maxconn, - TP_PROTO( - const struct net *net, - int maxconn - ), - TP_ARGS(net, maxconn), - TP_STRUCT__entry( - __field(unsigned int, netns_ino) - __field(int, maxconn) - ), - TP_fast_assign( - __entry->netns_ino = net->ns.inum; - __entry->maxconn = maxconn; - ), - TP_printk("maxconn=%d", - __entry->maxconn - ) -); - TRACE_EVENT(nfsd_ctl_time, TP_PROTO( const struct net *net, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index cd689df2ca5d..98ab55ba3ced 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1086,10 +1086,13 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, { unsigned long v, total; struct iov_iter iter; - loff_t ppos = offset; + struct kiocb kiocb; ssize_t host_err; size_t len; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = offset; + v = 0; total = *count; while (total) { @@ -1104,7 +1107,7 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count); - host_err = vfs_iter_read(file, &iter, &ppos, 0); + host_err = vfs_iocb_iter_read(file, &kiocb, &iter); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1170,15 +1173,14 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; struct super_block *sb = file_inode(file)->i_sb; + struct kiocb kiocb; struct svc_export *exp; struct iov_iter iter; errseq_t since; __be32 nfserr; int host_err; - loff_t pos = offset; unsigned long exp_op_flags = 0; unsigned int pflags = current->flags; - rwf_t flags = 0; bool restore_flags = false; unsigned int nvecs; @@ -1204,16 +1206,17 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!EX_ISSYNC(exp)) stable = NFS_UNSTABLE; - + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = offset; if (stable && !fhp->fh_use_wgather) - flags |= RWF_SYNC; + kiocb.ki_flags |= IOCB_DSYNC; nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); - host_err = vfs_iter_write(file, &iter, &pos, flags); + host_err = vfs_iocb_iter_write(file, &kiocb, &iter); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; @@ -1864,7 +1867,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; - struct inode *fdir, *tdir; int type = S_IFDIR; __be32 err; int host_err; @@ -1880,10 +1882,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) @@ -1944,10 +1944,10 @@ retry: } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = fdir, + .old_parent = fdentry, .old_dentry = odentry, .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = tdir, + .new_parent = tdentry, .new_dentry = ndentry, }; int retries; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index aa2a356da784..a23bc56051ca 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -870,7 +870,6 @@ struct nfsd4_compoundargs { char * tag; u32 taglen; u32 minorversion; - u32 client_opcnt; u32 opcnt; bool splice_ok; struct nfsd4_op *ops; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 0d8f7fb15c2e..dd0c8e560ef6 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2102,11 +2102,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { - if (unlikely(ret == -ENOENT)) + if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); + ret = -EINVAL; + } goto out; } diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 9b7f8e9655a2..6ca3d74be1e1 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio, int err; nr_dirty = nilfs_page_count_clean_buffers(folio, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); + copied = block_write_end(pos, len, len, folio); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 893ab36824cc..2d8dc6b35b54 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(bmap, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -EINVAL; + if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 0e3fc5ba33c7..1b8d754db44d 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -125,10 +125,10 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .page_mkwrite = nilfs_page_mkwrite, }; -static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int nilfs_file_mmap_prepare(struct vm_area_desc *desc) { - file_accessed(file); - vma->vm_ops = &nilfs_file_vm_ops; + file_accessed(desc->file); + desc->vm_ops = &nilfs_file_vm_ops; return 0; } @@ -144,7 +144,7 @@ const struct file_operations nilfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, #endif /* CONFIG_COMPAT */ - .mmap = nilfs_file_mmap, + .mmap_prepare = nilfs_file_mmap_prepare, .open = generic_file_open, /* .release = nilfs_release_file, */ .fsync = nilfs_sync_file, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6613b8fcceb0..87ddde159f0c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -218,7 +218,8 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) } } -static int nilfs_write_begin(struct file *file, struct address_space *mapping, +static int nilfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -237,7 +238,8 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, return err; } -static int nilfs_write_end(struct file *file, struct address_space *mapping, +static int nilfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { @@ -248,7 +250,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, nr_dirty = nilfs_page_count_clean_buffers(folio, start, start + copied); - copied = generic_write_end(file, mapping, pos, len, copied, folio, + copied = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); @@ -472,11 +474,18 @@ static int __nilfs_read_inode(struct super_block *sb, inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; - } else { + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &nilfs_special_inode_operations; init_special_inode( inode, inode->i_mode, huge_decode_dev(le64_to_cpu(raw_inode->i_device_code))); + } else { + nilfs_error(sb, + "invalid file type bits in mode 0%o for inode %lu", + inode->i_mode, ino); + err = -EIO; + goto failed_unmap; } nilfs_ifile_unmap_inode(raw_inode); brelse(bh); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index a66d62a51f77..3288c3b4be9e 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -118,7 +118,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, * * Return: always 0 as success. */ -int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); @@ -136,7 +136,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) * Return: 0 on success, or a negative error code on failure. */ int nilfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct nilfs_transaction_info ti; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2f850a18d6e7..946b0d3534a5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -422,8 +422,6 @@ static int nilfs_mdt_write_folio(struct folio *folio, if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_segment(sb); - else if (wbc->for_reclaim) - nilfs_flush_segment(sb, inode->i_ino); return err; } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index cb6ed54accd7..f466daa39440 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -268,9 +268,9 @@ int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ -int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m); +int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m); int nilfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long nilfs_ioctl(struct file *, unsigned int, unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 22aecf6e2344..a9c61d0492cb 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -560,8 +560,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, if (unlikely(err)) goto failed_folio; - block_write_end(NULL, inode->i_mapping, pos, blocksize, - blocksize, folio, NULL); + block_write_end(pos, blocksize, blocksize, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 83970d97840b..f15ca6fc400d 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2221,22 +2221,6 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) spin_unlock(&sci->sc_state_lock); } -/** - * nilfs_flush_segment - trigger a segment construction for resource control - * @sb: super block - * @ino: inode number of the file to be flushed out. - */ -void nilfs_flush_segment(struct super_block *sb, ino_t ino) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_sc_info *sci = nilfs->ns_writer; - - if (!sci || nilfs_doing_construction()) - return; - nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); - /* assign bit 0 to data files */ -} - struct nilfs_segctor_wait_request { wait_queue_entry_t wq; __u32 seq; @@ -2501,7 +2485,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_construction_timeout(struct timer_list *t) { - struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); + struct nilfs_sc_info *sci = timer_container_of(sci, t, sc_timer); wake_up_process(sci->sc_task); } diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index f723f47ddc4e..4b39ed43ae72 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -226,7 +226,6 @@ extern void nilfs_relax_pressure_in_lock(struct super_block *); extern int nilfs_construct_segment(struct super_block *); extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, loff_t, loff_t); -extern void nilfs_flush_segment(struct super_block *, ino_t); extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, void **); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index c4cdaf5fa7ed..9fb73bafd41d 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -308,6 +308,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) goto out_err; } + error = file_f_owner_allocate(filp); + if (error) + goto out_err; + /* new fsnotify mark, we expect most fcntl calls to add a new mark */ new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL); if (!new_dn_mark) { @@ -315,10 +319,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) goto out_err; } - error = file_f_owner_allocate(filp); - if (error) - goto out_err; - /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 3083643b864b..bfe884d624e7 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -454,7 +454,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, dwords = fh_len >> 2; type = exportfs_encode_fid(inode, buf, &dwords); err = -EINVAL; - if (type <= 0 || type == FILEID_INVALID || fh_len != dwords << 2) + /* + * Unlike file_handle, type and len of struct fanotify_fh are u8. + * Traditionally, filesystem return handle_type < 0xff, but there + * is no enforecement for that in vfs. + */ + BUILD_BUG_ON(MAX_HANDLE_SZ > 0xff || FILEID_INVALID > 0xff); + if (type <= 0 || type >= FILEID_INVALID || fh_len != dwords << 2) goto out_err; fh->type = type; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index e2b4f17a48bb..079b868552c2 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -199,8 +199,8 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, } /* Are there any inode/mount/sb objects that watch for these events? */ -static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, - __u32 mask) +static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, + __u32 mask) { __u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask | READ_ONCE(inode->i_sb->s_fsnotify_mask); @@ -656,20 +656,20 @@ EXPORT_SYMBOL_GPL(fsnotify); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* - * At open time we check fsnotify_sb_has_priority_watchers() and set the - * FMODE_NONOTIFY_ mode bits accordignly. + * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm + * hook and set the FMODE_NONOTIFY_ mode bits accordignly. * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ -void file_set_fsnotify_mode_from_watchers(struct file *file) +int fsnotify_open_perm_and_set_mode(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; - __u32 mnt_mask, p_mask; + __u32 mnt_mask, p_mask = 0; /* Is it a file opened by fanotify? */ if (FMODE_FSNOTIFY_NONE(file->f_mode)) - return; + return 0; /* * Permission events is a super set of pre-content events, so if there @@ -679,45 +679,64 @@ void file_set_fsnotify_mode_from_watchers(struct file *file) if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); - return; + return 0; } /* - * If there are permission event watchers but no pre-content event - * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + * OK, there are some permission event watchers. Check if anybody is + * watching for permission events on *this* file. */ - if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || - likely(!fsnotify_sb_has_priority_watchers(sb, - FSNOTIFY_PRIO_PRE_CONTENT))) { - file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); - return; + mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); + p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask, + ALL_FSNOTIFY_PERM_EVENTS); + if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { + parent = dget_parent(dentry); + p_mask |= fsnotify_inode_watches_children(d_inode(parent)); + dput(parent); } /* - * OK, there are some pre-content watchers. Check if anybody is - * watching for pre-content events on *this* file. + * Legacy FAN_ACCESS_PERM events have very high performance overhead, + * so unlikely to be used in the wild. If they are used there will be + * no optimizations at all. */ - mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); - if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, - FSNOTIFY_PRE_CONTENT_EVENTS))) { - /* Enable pre-content events */ + if (unlikely(p_mask & FS_ACCESS_PERM)) { + /* Enable all permission and pre-content events */ file_set_fsnotify_mode(file, 0); - return; + goto open_perm; } - /* Is parent watching for pre-content events on this file? */ - if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { - parent = dget_parent(dentry); - p_mask = fsnotify_inode_watches_children(d_inode(parent)); - dput(parent); - if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { - /* Enable pre-content events */ - file_set_fsnotify_mode(file, 0); - return; - } + /* + * Pre-content events are only supported on regular files. + * If there are pre-content event watchers and no permission access + * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. + * That is the common case with HSM service. + */ + if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) { + file_set_fsnotify_mode(file, FMODE_NONOTIFY | + FMODE_NONOTIFY_PERM); + goto open_perm; } - /* Nobody watching for pre-content events from this file */ - file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); + + /* Nobody watching permission and pre-content events on this file */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); + +open_perm: + /* + * Send open perm events depending on object masks and regardless of + * FMODE_NONOTIFY_PERM. + */ + if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) { + int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); + + if (ret) + return ret; + } + + if (p_mask & FS_OPEN_PERM) + return fsnotify_path(&file->f_path, FS_OPEN_PERM); + + return 0; } #endif diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index b6da80c69ca6..1b5c865a0339 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -304,6 +304,9 @@ static inline bool ntfs_dir_emit(struct ntfs_sb_info *sbi, if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) return true; + if (fname->name_len + sizeof(struct NTFS_DE) > le16_to_cpu(e->size)) + return true; + name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, PATH_MAX); if (name_len <= 0) { @@ -329,8 +332,7 @@ static inline bool ntfs_dir_emit(struct ntfs_sb_info *sbi, * It does additional locks/reads just to get the type of name. * Should we use additional mount option to enable branch below? */ - if (((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) || - fname->dup.ea_size) && + if (fname->dup.extend_data && ino != ni->mi.rno) { struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL); if (!IS_ERR_OR_NULL(inode)) { diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 34ed242e1063..c1ece707b195 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -57,6 +57,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) struct inode *inode = file_inode(filp); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ntfs_i(inode)))) + return -EINVAL; + switch (cmd) { case FITRIM: return ntfs_ioctl_fitrim(sbi, arg); @@ -81,6 +85,10 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */ @@ -154,13 +162,13 @@ static int ntfs_extend_initialized_size(struct file *file, if (pos + len > new_valid) len = new_valid - pos; - err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL); + err = ntfs_write_begin(NULL, mapping, pos, len, &folio, NULL); if (err) goto out; folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom); - err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); + err = ntfs_write_end(NULL, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; @@ -261,16 +269,21 @@ out: } /* - * ntfs_file_mmap - file_operations::mmap + * ntfs_file_mmap_prepare - file_operations::mmap_prepare */ -static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); struct ntfs_inode *ni = ntfs_i(inode); - u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT); - bool rw = vma->vm_flags & VM_WRITE; + u64 from = ((u64)desc->pgoff << PAGE_SHIFT); + bool rw = desc->vm_flags & VM_WRITE; int err; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -291,7 +304,7 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (rw) { u64 to = min_t(loff_t, i_size_read(inode), - from + vma->vm_end - vma->vm_start); + from + desc->end - desc->start); if (is_sparsed(ni)) { /* Allocate clusters for rw map. */ @@ -310,7 +323,10 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) } if (ni->i_valid < to) { - inode_lock(inode); + if (!inode_trylock(inode)) { + err = -EAGAIN; + goto out; + } err = ntfs_extend_initialized_size(file, ni, ni->i_valid, to); inode_unlock(inode); @@ -319,7 +335,7 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) } } - err = generic_file_mmap(file, vma); + err = generic_file_mmap_prepare(desc); out: return err; } @@ -735,6 +751,10 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode = inode->i_mode; int err; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -795,6 +815,10 @@ static int check_read_restriction(struct inode *inode) { struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -913,7 +937,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; struct ntfs_sb_info *sbi = ni->mi.sbi; - struct page *page, **pages = NULL; + struct page **pages = NULL; + struct folio *folio; size_t written = 0; u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = 1u << frame_bits; @@ -923,7 +948,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) u64 frame_vbo; pgoff_t index; bool frame_uptodate; - struct folio *folio; if (frame_size < PAGE_SIZE) { /* @@ -977,8 +1001,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) pages_per_frame); if (err) { for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_unlock(folio); folio_put(folio); } @@ -989,10 +1012,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ip = off >> PAGE_SHIFT; off = offset_in_page(valid); for (; ip < pages_per_frame; ip++, off = 0) { - page = pages[ip]; - folio = page_folio(page); - zero_user_segment(page, off, PAGE_SIZE); - flush_dcache_page(page); + folio = page_folio(pages[ip]); + folio_zero_segment(folio, off, PAGE_SIZE); + flush_dcache_folio(folio); folio_mark_uptodate(folio); } @@ -1001,8 +1023,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ni_unlock(ni); for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_mark_uptodate(folio); folio_unlock(folio); folio_put(folio); @@ -1046,8 +1067,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (err) { for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_unlock(folio); folio_put(folio); } @@ -1065,10 +1085,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { size_t cp, tail = PAGE_SIZE - off; - page = pages[ip]; - cp = copy_page_from_iter_atomic(page, off, + folio = page_folio(pages[ip]); + cp = copy_folio_from_iter_atomic(folio, off, min(tail, bytes), from); - flush_dcache_page(page); + flush_dcache_folio(folio); copied += cp; bytes -= cp; @@ -1088,9 +1108,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ni_unlock(ni); for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - ClearPageDirty(page); - folio = page_folio(page); + folio = page_folio(pages[ip]); + folio_clear_dirty(folio); folio_mark_uptodate(folio); folio_unlock(folio); folio_put(folio); @@ -1135,6 +1154,10 @@ static int check_write_restriction(struct inode *inode) { struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -1217,6 +1240,10 @@ int ntfs_file_open(struct inode *inode, struct file *file) { struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -1286,6 +1313,10 @@ int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int err; struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + err = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR); if (err) return err; @@ -1336,7 +1367,7 @@ const struct file_operations ntfs_file_operations = { #endif .splice_read = ntfs_file_splice_read, .splice_write = ntfs_file_splice_write, - .mmap = ntfs_file_mmap, + .mmap_prepare = ntfs_file_mmap_prepare, .open = ntfs_file_open, .fsync = generic_file_fsync, .fallocate = ntfs_fallocate, diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 756e1306fe6c..8f9fe1d7a690 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3003,8 +3003,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, * ni_rename - Remove one name and insert new name. */ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, - struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, - bool *is_bad) + struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de) { int err; struct NTFS_DE *de2 = NULL; @@ -3027,8 +3026,8 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, err = ni_add_name(new_dir_ni, ni, new_de); if (!err) { err = ni_remove_name(dir_ni, ni, de, &de2, &undo); - if (err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo)) - *is_bad = true; + WARN_ON(err && ni_remove_name(new_dir_ni, ni, new_de, &de2, + &undo)); } /* @@ -3119,11 +3118,21 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, } } - /* TODO: Fill reparse info. */ - dup->reparse = 0; - dup->ea_size = 0; + dup->extend_data = 0; - if (ni->ni_flags & NI_FLAG_EA) { + if (dup->fa & FILE_ATTRIBUTE_REPARSE_POINT) { + attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, + NULL); + + if (attr) { + const struct REPARSE_POINT *rp; + + rp = resident_data_ex(attr, sizeof(struct REPARSE_POINT)); + /* If ATTR_REPARSE exists 'rp' can't be NULL. */ + if (rp) + dup->extend_data = rp->ReparseTag; + } + } else if (ni->ni_flags & NI_FLAG_EA) { attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL, NULL); if (attr) { @@ -3132,7 +3141,7 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, info = resident_data_ex(attr, sizeof(struct EA_INFO)); /* If ATTR_EA_INFO exists 'info' can't be NULL. */ if (info) - dup->ea_size = info->size_pack; + dup->extend_data = info->size; } } @@ -3199,6 +3208,10 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_bad_inode(inode) || sb_rdonly(sb)) return 0; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(sb))) return -EIO; diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index df81f1f7330c..c7a2f191254d 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -905,9 +905,13 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) void ntfs_bad_inode(struct inode *inode, const char *hint) { struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(inode); ntfs_inode_err(inode, "%s", hint); - make_bad_inode(inode); + + /* Do not call make_bad_inode()! */ + ni->ni_bad = true; + /* Avoid recursion if bad inode is $Volume. */ if (inode->i_ino != MFT_REC_VOL && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 0f0d27d4644a..37cbbee7fa58 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -878,6 +878,10 @@ static int ntfs_resident_writepage(struct folio *folio, struct ntfs_inode *ni = ntfs_i(inode); int ret; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -896,6 +900,10 @@ static int ntfs_writepages(struct address_space *mapping, { struct inode *inode = mapping->host; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ntfs_i(inode)))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -912,13 +920,17 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, bh_result, create, GET_BLOCK_WRITE_BEGIN); } -int ntfs_write_begin(struct file *file, struct address_space *mapping, +int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, u32 len, struct folio **foliop, void **fsdata) { int err; struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -957,7 +969,8 @@ out: /* * ntfs_write_end - Address_space_operations::write_end. */ -int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, +int ntfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -989,7 +1002,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, folio_unlock(folio); folio_put(folio); } else { - err = generic_write_end(file, mapping, pos, len, copied, folio, + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); } @@ -1062,10 +1075,10 @@ int inode_read_data(struct inode *inode, void *data, size_t bytes) * Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK) * for unicode string of @uni_len length. */ -static inline u32 ntfs_reparse_bytes(u32 uni_len) +static inline u32 ntfs_reparse_bytes(u32 uni_len, bool is_absolute) { /* Header + unicode string + decorated unicode string. */ - return sizeof(short) * (2 * uni_len + 4) + + return sizeof(short) * (2 * uni_len + (is_absolute ? 4 : 0)) + offsetof(struct REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer.PathBuffer); } @@ -1078,8 +1091,11 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, struct REPARSE_DATA_BUFFER *rp; __le16 *rp_name; typeof(rp->SymbolicLinkReparseBuffer) *rs; + bool is_absolute; - rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS); + is_absolute = (strlen(symname) > 1 && symname[1] == ':'); + + rp = kzalloc(ntfs_reparse_bytes(2 * size + 2, is_absolute), GFP_NOFS); if (!rp) return ERR_PTR(-ENOMEM); @@ -1094,7 +1110,7 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, goto out; /* err = the length of unicode name of symlink. */ - *nsize = ntfs_reparse_bytes(err); + *nsize = ntfs_reparse_bytes(err, is_absolute); if (*nsize > sbi->reparse.max_size) { err = -EFBIG; @@ -1114,7 +1130,7 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, /* PrintName + SubstituteName. */ rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err); - rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8); + rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + (is_absolute ? 8 : 0)); rs->PrintNameLength = rs->SubstituteNameOffset; /* @@ -1122,16 +1138,18 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, * parse this path. * 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE). */ - rs->Flags = 0; + rs->Flags = cpu_to_le32(is_absolute ? 0 : SYMLINK_FLAG_RELATIVE); - memmove(rp_name + err + 4, rp_name, sizeof(short) * err); + memmove(rp_name + err + (is_absolute ? 4 : 0), rp_name, sizeof(short) * err); - /* Decorate SubstituteName. */ - rp_name += err; - rp_name[0] = cpu_to_le16('\\'); - rp_name[1] = cpu_to_le16('?'); - rp_name[2] = cpu_to_le16('?'); - rp_name[3] = cpu_to_le16('\\'); + if (is_absolute) { + /* Decorate SubstituteName. */ + rp_name += err; + rp_name[0] = cpu_to_le16('\\'); + rp_name[1] = cpu_to_le16('?'); + rp_name[2] = cpu_to_le16('?'); + rp_name[3] = cpu_to_le16('\\'); + } return rp; out: @@ -1260,6 +1278,12 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, goto out1; } + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(dir_ni))) { + err = -EINVAL; + goto out2; + } + if (unlikely(ntfs3_forced_shutdown(sb))) { err = -EIO; goto out2; @@ -1350,7 +1374,7 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, fname->dup.a_time = std5->cr_time; fname->dup.alloc_size = fname->dup.data_size = 0; fname->dup.fa = std5->fa; - fname->dup.ea_size = fname->dup.reparse = 0; + fname->dup.extend_data = S_ISLNK(mode) ? IO_REPARSE_TAG_SYMLINK : 0; dsize = le16_to_cpu(new_de->key_size); asize = ALIGN(SIZEOF_RESIDENT + dsize, 8); @@ -1590,27 +1614,29 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, inode->i_flags |= S_NOSEC; } - /* - * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute. - * The packed size of extended attribute is stored in direntry too. - * 'fname' here points to inside new_de. - */ - err = ntfs_save_wsl_perm(inode, &fname->dup.ea_size); - if (err) - goto out6; + if (!S_ISLNK(mode)) { + /* + * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute. + * The packed size of extended attribute is stored in direntry too. + * 'fname' here points to inside new_de. + */ + err = ntfs_save_wsl_perm(inode, &fname->dup.extend_data); + if (err) + goto out6; - /* - * update ea_size in file_name attribute too. - * Use ni_find_attr cause layout of MFT record may be changed - * in ntfs_init_acl and ntfs_save_wsl_perm. - */ - attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL); - if (attr) { - struct ATTR_FILE_NAME *fn; + /* + * update ea_size in file_name attribute too. + * Use ni_find_attr cause layout of MFT record may be changed + * in ntfs_init_acl and ntfs_save_wsl_perm. + */ + attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL); + if (attr) { + struct ATTR_FILE_NAME *fn; - fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); - if (fn) - fn->dup.ea_size = fname->dup.ea_size; + fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (fn) + fn->dup.extend_data = fname->dup.extend_data; + } } /* We do not need to update parent directory later */ diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index b807744fc6a9..82c8ae56beee 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -171,6 +171,10 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) return -EIO; @@ -191,6 +195,10 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, { u32 size = strlen(symname); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ntfs_i(dir)))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) return -EIO; @@ -216,6 +224,10 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) struct ntfs_inode *ni = ntfs_i(dir); int err; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) return -EIO; @@ -244,7 +256,7 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, struct ntfs_inode *ni = ntfs_i(inode); struct inode *new_inode = d_inode(new_dentry); struct NTFS_DE *de, *new_de; - bool is_same, is_bad; + bool is_same; /* * de - memory of PATH_MAX bytes: * [0-1024) - original name (dentry->d_name) @@ -256,6 +268,10 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, 1024); static_assert(PATH_MAX >= 4 * 1024); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(sb))) return -EIO; @@ -313,12 +329,8 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, if (dir_ni != new_dir_ni) ni_lock_dir2(new_dir_ni); - is_bad = false; - err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad); - if (is_bad) { - /* Restore after failed rename failed too. */ - _ntfs_bad_inode(inode); - } else if (!err) { + err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de); + if (!err) { simple_rename_timestamp(dir, dentry, new_dir, new_dentry); mark_inode_dirty(inode); mark_inode_dirty(dir); diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 1ff13b6f9613..552b97905813 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -561,8 +561,7 @@ struct NTFS_DUP_INFO { __le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size. __le64 data_size; // 0x28: Data attribute size <= Dataalloc_size. enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more. - __le16 ea_size; // 0x34: Packed EAs. - __le16 reparse; // 0x36: Used by Reparse. + __le32 extend_data; // 0x34: Extended data. }; // 0x38 diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 36b8052660d5..1296e6fcc779 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -377,6 +377,13 @@ struct ntfs_inode { */ u8 mi_loaded; + /* + * Use this field to avoid any write(s). + * If inode is bad during initialization - use make_bad_inode + * If inode is bad during operations - use this field + */ + u8 ni_bad; + union { struct ntfs_index dir; struct { @@ -577,8 +584,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, struct NTFS_DE *de); int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, - struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, - bool *is_bad); + struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de); bool ni_is_dirty(struct inode *inode); @@ -702,10 +708,12 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, int ntfs_set_size(struct inode *inode, u64 new_size); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); -int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct folio **foliop, void **fsdata); -int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct folio *folio, void *fsdata); +int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, u32 len, struct folio **foliop, + void **fsdata); +int ntfs_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, u32 len, u32 copied, struct folio *folio, + void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int inode_read_data(struct inode *inode, void *data, size_t bytes); @@ -874,7 +882,7 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *const ntfs_xattr_handlers[]; -int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); +int ntfs_save_wsl_perm(struct inode *inode, __le32 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); /* globals from lznt.c */ @@ -1025,6 +1033,11 @@ static inline bool is_compressed(const struct ntfs_inode *ni) (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); } +static inline bool is_bad_ni(const struct ntfs_inode *ni) +{ + return ni->ni_bad; +} + static inline int ni_ext_compress_bits(const struct ntfs_inode *ni) { return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 920a1ab47b63..ddff94c091b8 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -1223,7 +1223,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_export_op = &ntfs_export_ops; sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL; + if (options->nocase) + set_default_d_op(sb, &ntfs_dentry_ops); options->nls = ntfs_load_nls(options->nls_name); if (IS_ERR(options->nls)) { diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index e0055dcf8fe3..e519e21596a7 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -313,7 +313,7 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, size_t val_size, int flags, bool locked, - __le16 *ea_size) + __le32 *ea_size) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -522,7 +522,7 @@ update_ea: if (ea_info.size_pack != size_pack) ni->ni_flags |= NI_FLAG_UPDATE_PARENT; if (ea_size) - *ea_size = ea_info.size_pack; + *ea_size = ea_info.size; mark_inode_dirty(&ni->vfs_inode); out: @@ -552,6 +552,10 @@ struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int err; void *buf; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return ERR_PTR(-EINVAL); + /* Allocate PATH_MAX bytes. */ buf = __getname(); if (!buf) @@ -600,6 +604,10 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, int flags; umode_t mode; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ntfs_i(inode)))) + return -EINVAL; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -730,6 +738,10 @@ ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct ntfs_inode *ni = ntfs_i(inode); ssize_t ret; + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (!(ni->ni_flags & NI_FLAG_EA)) { /* no xattr in file */ return 0; @@ -751,6 +763,10 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, int err; struct ntfs_inode *ni = ntfs_i(inode); + /* Avoid any operation if inode is bad. */ + if (unlikely(is_bad_ni(ni))) + return -EINVAL; + if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; @@ -950,7 +966,7 @@ out: * * save uid/gid/mode in xattr */ -int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size) +int ntfs_save_wsl_perm(struct inode *inode, __le32 *ea_size) { int err; __le32 value; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 40b6bce12951..2203438738f6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1856,7 +1856,8 @@ out: return ret; } -static int ocfs2_write_begin(struct file *file, struct address_space *mapping, +static int ocfs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -2047,7 +2048,8 @@ out: return copied; } -static int ocfs2_write_end(struct file *file, struct address_space *mapping, +static int ocfs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index fce9beb214f0..b05d4e9d13b2 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1483,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout); + struct o2net_sock_container *sc = timer_container_of(sc, t, + sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2056cf08ac1e..21d797ccccd0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2800,7 +2800,7 @@ const struct inode_operations ocfs2_special_file_iops = { */ const struct file_operations ocfs2_fops = { .llseek = ocfs2_file_llseek, - .mmap = ocfs2_mmap, + .mmap_prepare = ocfs2_mmap_prepare, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, @@ -2850,7 +2850,7 @@ const struct file_operations ocfs2_dops = { */ const struct file_operations ocfs2_fops_no_plocks = { .llseek = ocfs2_file_llseek, - .mmap = ocfs2_mmap, + .mmap_prepare = ocfs2_mmap_prepare, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 1ad7106741f8..3ad7baf67658 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - return (!ret ? count : ret); + return ret ?: count; } diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7ae96fb8807a..db14c92302a1 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -62,7 +62,7 @@ static inline int o2info_coherent(struct ocfs2_info_request *req) return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } -int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags; @@ -83,7 +83,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ocfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags = fa->flags; diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 48a5fdfe87a1..4a1c2313b429 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -11,9 +11,9 @@ #ifndef OCFS2_IOCTL_PROTO_H #define OCFS2_IOCTL_PROTO_H -int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int ocfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 6a314e9f2b49..50e2faf64c19 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -159,8 +159,9 @@ static const struct vm_operations_struct ocfs2_file_vm_ops = { .page_mkwrite = ocfs2_page_mkwrite, }; -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +int ocfs2_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), @@ -171,7 +172,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) } ocfs2_inode_unlock(file_inode(file), lock_level); out: - vma->vm_ops = &ocfs2_file_vm_ops; + desc->vm_ops = &ocfs2_file_vm_ops; return 0; } diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h index 1051507cc684..d21c30de6b8c 100644 --- a/fs/ocfs2/mmap.h +++ b/fs/ocfs2/mmap.h @@ -2,6 +2,6 @@ #ifndef OCFS2_MMAP_H #define OCFS2_MMAP_H -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); +int ocfs2_mmap_prepare(struct vm_area_desc *desc); #endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index e272429da3db..de7f12858729 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -674,7 +674,7 @@ out_put: break; } out: - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ddd761cf44c8..a28c127b9934 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -691,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void) memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3d2533950bae..53daa4482406 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1962,7 +1962,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; - sb->s_d_op = &ocfs2_dentry_ops; + set_default_d_op(sb, &ocfs2_dentry_ops); sb->s_export_op = &ocfs2_export_ops; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->dq_op = &ocfs2_quota_operations; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 98358d405b6a..49a1de5a827f 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -310,9 +310,10 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) } } -static int omfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int omfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -332,7 +333,7 @@ const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, }; diff --git a/fs/open.c b/fs/open.c index 7828234a7caa..9655158c3885 100644 --- a/fs/open.c +++ b/fs/open.c @@ -281,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: + case FALLOC_FL_WRITE_ZEROES: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; @@ -943,12 +944,12 @@ static int do_dentry_open(struct file *f, goto cleanup_all; /* - * Set FMODE_NONOTIFY_* bits according to existing permission watches. + * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits + * according to existing permission watches. * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a * pseudo file, this call will not change the mode. */ - file_set_fsnotify_mode_from_watchers(f); - error = fsnotify_open_perm(f); + error = fsnotify_open_perm_and_set_mode(f); if (error) goto cleanup_all; @@ -1204,14 +1205,11 @@ struct file *kernel_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - error = do_dentry_open(f, NULL); + error = vfs_open(path, f); if (error) { fput(f); return ERR_PTR(error); } - - fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 90c49c0de243..919f99b16834 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -398,8 +398,9 @@ static const struct vm_operations_struct orangefs_file_vm_ops = { /* * Memory map a region of a file. */ -static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int orangefs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; int ret; ret = orangefs_revalidate_mapping(file_inode(file)); @@ -410,10 +411,11 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); + desc->vm_flags |= VM_SEQ_READ; + desc->vm_flags &= ~VM_RAND_READ; file_accessed(file); - vma->vm_ops = &orangefs_file_vm_ops; + desc->vm_ops = &orangefs_file_vm_ops; return 0; } @@ -574,7 +576,7 @@ const struct file_operations orangefs_file_operations = { .read_iter = orangefs_file_read_iter, .write_iter = orangefs_file_write_iter, .lock = orangefs_lock, - .mmap = orangefs_file_mmap, + .mmap_prepare = orangefs_file_mmap_prepare, .open = generic_file_open, .splice_read = orangefs_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 08a6f372a352..a01400cd41fd 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -285,9 +285,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) return ret; } -static int orangefs_write_begin(struct file *file, - struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int orangefs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; @@ -340,9 +341,10 @@ okay: return 0; } -static int orangefs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int orangefs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -372,7 +374,7 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - mark_inode_dirty_sync(file_inode(file)); + mark_inode_dirty_sync(file_inode(iocb->ki_filp)); return copied; } @@ -887,7 +889,7 @@ int orangefs_update_time(struct inode *inode, int flags) return __orangefs_setattr(inode, &iattr); } -static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int orangefs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { u64 val = 0; int ret; @@ -908,7 +910,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int orangefs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { u64 val = 0; diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index f7095c91660c..1c375fb65018 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -396,7 +396,7 @@ static ssize_t orangefs_debug_read(struct file *file, goto out; mutex_lock(&orangefs_debug_lock); - sprintf_ret = sprintf(buf, "%s", (char *)file->private_data); + sprintf_ret = scnprintf(buf, ORANGEFS_MAX_DEBUG_STRING_LEN, "%s", (char *)file->private_data); mutex_unlock(&orangefs_debug_lock); read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret); @@ -769,8 +769,8 @@ static void do_k_string(void *k_mask, int index) if (*mask & s_kmod_keyword_mask_map[index].mask_val) { if ((strlen(kernel_debug_string) + - strlen(s_kmod_keyword_mask_map[index].keyword)) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { + strlen(s_kmod_keyword_mask_map[index].keyword) + 1) + < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcat(kernel_debug_string, s_kmod_keyword_mask_map[index].keyword); strcat(kernel_debug_string, ","); @@ -797,7 +797,7 @@ static void do_c_string(void *c_mask, int index) (mask->mask2 & cdm_array[index].mask2)) { if ((strlen(client_debug_string) + strlen(cdm_array[index].keyword) + 1) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { + < ORANGEFS_MAX_DEBUG_STRING_LEN) { strcat(client_debug_string, cdm_array[index].keyword); strcat(client_debug_string, ","); diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 04e15dfa504a..369455b354ef 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -217,36 +217,31 @@ static ssize_t sysfs_int_show(struct kobject *kobj, if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "op_timeout_secs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", op_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", orangefs_cache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%d\n", orangefs_getattr_timeout_msecs); goto out; @@ -256,14 +251,12 @@ static ssize_t sysfs_int_show(struct kobject *kobj, } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "reads")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%lu\n", orangefs_stats.reads); goto out; } else if (!strcmp(attr->attr.name, "writes")) { - rc = scnprintf(buf, - PAGE_SIZE, + rc = sysfs_emit(buf, "%lu\n", orangefs_stats.writes); goto out; @@ -497,19 +490,18 @@ out: if (strcmp(kobj->name, PC_KOBJ_ID)) { if (new_op->upcall.req.param.op == ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { - rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", + rc = sysfs_emit(buf, "%d %d\n", (int)new_op->downcall.resp.param.u. value32[0], (int)new_op->downcall.resp.param.u. value32[1]); } else { - rc = scnprintf(buf, PAGE_SIZE, "%d\n", + rc = sysfs_emit(buf, "%d\n", (int)new_op->downcall.resp.param.u.value64); } } else { - rc = scnprintf( + rc = sysfs_emit( buf, - PAGE_SIZE, "%s", new_op->downcall.resp.perf_count.buffer); } diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 64ca9498f550..f3da840758e7 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -416,7 +416,7 @@ static int orangefs_fill_sb(struct super_block *sb, sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; sb->s_op = &orangefs_s_ops; - sb->s_d_op = &orangefs_dentry_operations; + set_default_d_op(sb, &orangefs_dentry_operations); sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d7310fcf3888..27396fe63f6d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -171,14 +171,14 @@ out: static int ovl_copy_fileattr(struct inode *inode, const struct path *old, const struct path *new) { - struct fileattr oldfa = { .flags_valid = true }; - struct fileattr newfa = { .flags_valid = true }; + struct file_kattr oldfa = { .flags_valid = true }; + struct file_kattr newfa = { .flags_valid = true }; int err; err = ovl_real_fileattr_get(old, &oldfa); if (err) { /* Ntfs-3g returns -EINVAL for "no fileattr support" */ - if (err == -ENOTTY || err == -EINVAL) + if (err == -EOPNOTSUPP || err == -EINVAL) return 0; pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", old->dentry, err); @@ -517,15 +517,12 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, /* * Create and install index entry. - * - * Caller must hold i_mutex on indexdir. */ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, struct dentry *upper) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); - struct inode *dir = d_inode(indexdir); struct dentry *index = NULL; struct dentry *temp = NULL; struct qstr name = { }; @@ -559,16 +556,20 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, if (err) goto out; + err = ovl_parent_lock(indexdir, temp); + if (err) + goto out; index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); } else { - err = ovl_do_rename(ofs, dir, temp, dir, index, 0); + err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0); dput(index); } + ovl_parent_unlock(indexdir); out: if (err) - ovl_cleanup(ofs, dir, temp); + ovl_cleanup(ofs, indexdir, temp); dput(temp); free_name: kfree(name.name); @@ -762,7 +763,6 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; - struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); struct path path = { .mnt = ovl_upper_mnt(ofs) }; struct dentry *temp, *upper, *trap; struct ovl_cu_creds cc; @@ -779,9 +779,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) return err; ovl_start_write(c->dentry); - inode_lock(wdir); temp = ovl_create_temp(ofs, c->workdir, &cattr); - inode_unlock(wdir); ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); @@ -794,45 +792,47 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) */ path.dentry = temp; err = ovl_copy_up_data(c, &path); + ovl_start_write(c->dentry); + if (err) + goto cleanup_unlocked; + + if (S_ISDIR(c->stat.mode) && c->indexed) { + err = ovl_create_index(c->dentry, c->origin_fh, temp); + if (err) + goto cleanup_unlocked; + } + /* * We cannot hold lock_rename() throughout this helper, because of * lock ordering with sb_writers, which shouldn't be held when calling * ovl_copy_up_data(), so lock workdir and destdir and make sure that * temp wasn't moved before copy up completion or cleanup. */ - ovl_start_write(c->dentry); trap = lock_rename(c->workdir, c->destdir); if (trap || temp->d_parent != c->workdir) { /* temp or workdir moved underneath us? abort without cleanup */ dput(temp); err = -EIO; - if (IS_ERR(trap)) - goto out; - goto unlock; - } else if (err) { - goto cleanup; + if (!IS_ERR(trap)) + unlock_rename(c->workdir, c->destdir); + goto out; } err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; - if (S_ISDIR(c->stat.mode) && c->indexed) { - err = ovl_create_index(c->dentry, c->origin_fh, temp); - if (err) - goto cleanup; - } - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, c->destname.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto cleanup; - err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0); + err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0); + unlock_rename(c->workdir, c->destdir); dput(upper); if (err) - goto cleanup; + goto cleanup_unlocked; inode = d_inode(c->dentry); if (c->metacopy_digest) @@ -846,17 +846,17 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) ovl_inode_update(inode, temp); if (S_ISDIR(inode->i_mode)) ovl_set_flag(OVL_WHITEOUTS, inode); -unlock: - unlock_rename(c->workdir, c->destdir); out: ovl_end_write(c->dentry); return err; cleanup: - ovl_cleanup(ofs, wdir, temp); + unlock_rename(c->workdir, c->destdir); +cleanup_unlocked: + ovl_cleanup(ofs, c->workdir, temp); dput(temp); - goto unlock; + goto out; } /* Copyup using O_TMPFILE which does not require cross dir locking */ diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index fe493f3ed6b6..70b8687dc45e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -24,7 +24,8 @@ MODULE_PARM_DESC(redirect_max, static int ovl_set_redirect(struct dentry *dentry, bool samedir); -int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry) +static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir, + struct dentry *wdentry) { int err; @@ -43,6 +44,21 @@ int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry) return err; } +int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, + struct dentry *wdentry) +{ + int err; + + err = ovl_parent_lock(workdir, wdentry); + if (err) + return err; + + ovl_cleanup_locked(ofs, workdir->d_inode, wdentry); + ovl_parent_unlock(workdir); + + return 0; +} + struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) { struct dentry *temp; @@ -62,7 +78,6 @@ struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) return temp; } -/* caller holds i_mutex on workdir */ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) { int err; @@ -70,47 +85,52 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) struct dentry *workdir = ofs->workdir; struct inode *wdir = workdir->d_inode; + guard(mutex)(&ofs->whiteout_lock); + if (!ofs->whiteout) { + inode_lock_nested(wdir, I_MUTEX_PARENT); whiteout = ovl_lookup_temp(ofs, workdir); - if (IS_ERR(whiteout)) - goto out; - - err = ovl_do_whiteout(ofs, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - goto out; + if (!IS_ERR(whiteout)) { + err = ovl_do_whiteout(ofs, wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } } + inode_unlock(wdir); + if (IS_ERR(whiteout)) + return whiteout; ofs->whiteout = whiteout; } if (!ofs->no_shared_whiteout) { + inode_lock_nested(wdir, I_MUTEX_PARENT); whiteout = ovl_lookup_temp(ofs, workdir); - if (IS_ERR(whiteout)) - goto out; - - err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); - if (!err) - goto out; - - if (err != -EMLINK) { - pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n", - ofs->whiteout->d_inode->i_nlink, err); + if (!IS_ERR(whiteout)) { + err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + } + inode_unlock(wdir); + if (!IS_ERR(whiteout)) + return whiteout; + if (PTR_ERR(whiteout) != -EMLINK) { + pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n", + ofs->whiteout->d_inode->i_nlink, + PTR_ERR(whiteout)); ofs->no_shared_whiteout = true; } - dput(whiteout); } whiteout = ofs->whiteout; ofs->whiteout = NULL; -out: return whiteout; } -/* Caller must hold i_mutex on both workdir and dir */ -int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry) { - struct inode *wdir = ofs->workdir->d_inode; struct dentry *whiteout; int err; int flags = 0; @@ -123,24 +143,29 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags); + err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry); + if (!err) { + err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags); + unlock_rename(ofs->workdir, dir); + } if (err) goto kill_whiteout; if (flags) - ovl_cleanup(ofs, wdir, dentry); + ovl_cleanup(ofs, ofs->workdir, dentry); out: dput(whiteout); return err; kill_whiteout: - ovl_cleanup(ofs, wdir, whiteout); + ovl_cleanup(ofs, ofs->workdir, whiteout); goto out; } -struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, +struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, struct dentry *newdentry, struct ovl_cattr *attr) { + struct inode *dir = parent->d_inode; int err; if (IS_ERR(newdentry)) @@ -199,8 +224,12 @@ out: struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr) { - return ovl_create_real(ofs, d_inode(workdir), - ovl_lookup_temp(ofs, workdir), attr); + struct dentry *ret; + inode_lock(workdir->d_inode); + ret = ovl_create_real(ofs, workdir, + ovl_lookup_temp(ofs, workdir), attr); + inode_unlock(workdir->d_inode); + return ret; } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -303,13 +332,13 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, int err; inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = ovl_create_real(ofs, udir, + newdentry = ovl_create_real(ofs, upperdir, ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, dentry->d_name.len), attr); - err = PTR_ERR(newdentry); + inode_unlock(udir); if (IS_ERR(newdentry)) - goto out_unlock; + return PTR_ERR(newdentry); if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && !ovl_allow_offline_changes(ofs)) { @@ -321,14 +350,12 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL); if (err) goto out_cleanup; -out_unlock: - inode_unlock(udir); - return err; + return 0; out_cleanup: - ovl_cleanup(ofs, udir, newdentry); + ovl_cleanup(ofs, upperdir, newdentry); dput(newdentry); - goto out_unlock; + return err; } static struct dentry *ovl_clear_empty(struct dentry *dentry, @@ -336,9 +363,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); - struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); - struct inode *udir = upperdir->d_inode; struct path upperpath; struct dentry *upper; struct dentry *opaquedir; @@ -348,27 +373,25 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (WARN_ON(!workdir)) return ERR_PTR(-EROFS); - err = ovl_lock_rename_workdir(workdir, upperdir); - if (err) - goto out; - ovl_path_upper(dentry, &upperpath); err = vfs_getattr(&upperpath, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (err) - goto out_unlock; + goto out; err = -ESTALE; if (!S_ISDIR(stat.mode)) - goto out_unlock; + goto out; upper = upperpath.dentry; - if (upper->d_parent->d_inode != udir) - goto out_unlock; opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode)); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) - goto out_unlock; + goto out; + + err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper); + if (err) + goto out_cleanup_unlocked; err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir); if (err) @@ -384,13 +407,13 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE); + unlock_rename(workdir, upperdir); if (err) - goto out_cleanup; + goto out_cleanup_unlocked; ovl_cleanup_whiteouts(ofs, upper, list); - ovl_cleanup(ofs, wdir, upper); - unlock_rename(workdir, upperdir); + ovl_cleanup(ofs, workdir, upper); /* dentry's upper doesn't match now, get rid of it */ d_drop(dentry); @@ -398,10 +421,10 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, return opaquedir; out_cleanup: - ovl_cleanup(ofs, wdir, opaquedir); - dput(opaquedir); -out_unlock: unlock_rename(workdir, upperdir); +out_cleanup_unlocked: + ovl_cleanup(ofs, workdir, opaquedir); + dput(opaquedir); out: return ERR_PTR(err); } @@ -420,9 +443,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); - struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); - struct inode *udir = upperdir->d_inode; struct dentry *upper; struct dentry *newdentry; int err; @@ -439,15 +460,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, return err; } - err = ovl_lock_rename_workdir(workdir, upperdir); - if (err) - goto out; - - upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper_unlocked(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_unlock; + goto out; err = -ESTALE; if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper)) @@ -458,6 +475,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) goto out_dput; + err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper); + if (err) + goto out_cleanup_unlocked; + /* * mode could have been mutilated due to umask (e.g. sgid directory) */ @@ -491,27 +512,27 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, + err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, RENAME_EXCHANGE); + unlock_rename(workdir, upperdir); if (err) - goto out_cleanup; + goto out_cleanup_unlocked; - ovl_cleanup(ofs, wdir, upper); + ovl_cleanup(ofs, workdir, upper); } else { - err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0); + err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0); + unlock_rename(workdir, upperdir); if (err) - goto out_cleanup; + goto out_cleanup_unlocked; } ovl_dir_modified(dentry->d_parent, false); err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL); if (err) { - ovl_cleanup(ofs, udir, newdentry); + ovl_cleanup(ofs, upperdir, newdentry); dput(newdentry); } out_dput: dput(upper); -out_unlock: - unlock_rename(workdir, upperdir); out: if (!hardlink) { posix_acl_release(acl); @@ -520,7 +541,9 @@ out: return err; out_cleanup: - ovl_cleanup(ofs, wdir, newdentry); + unlock_rename(workdir, upperdir); +out_cleanup_unlocked: + ovl_cleanup(ofs, workdir, newdentry); dput(newdentry); goto out_dput; } @@ -757,15 +780,11 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, goto out; } - err = ovl_lock_rename_workdir(workdir, upperdir); - if (err) - goto out_dput; - - upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper_unlocked(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_unlock; + goto out_dput; err = -ESTALE; if ((opaquedir && upper != opaquedir) || @@ -774,17 +793,13 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, goto out_dput_upper; } - err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper); - if (err) - goto out_d_drop; + err = ovl_cleanup_and_whiteout(ofs, upperdir, upper); + if (!err) + ovl_dir_modified(dentry->d_parent, true); - ovl_dir_modified(dentry->d_parent, true); -out_d_drop: d_drop(dentry); out_dput_upper: dput(upper); -out_unlock: - unlock_rename(workdir, upperdir); out_dput: dput(opaquedir); out: @@ -1069,9 +1084,9 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, int err; struct dentry *old_upperdir; struct dentry *new_upperdir; - struct dentry *olddentry; - struct dentry *newdentry; - struct dentry *trap; + struct dentry *olddentry = NULL; + struct dentry *newdentry = NULL; + struct dentry *trap, *de; bool old_opaque; bool new_opaque; bool cleanup_whiteout = false; @@ -1184,21 +1199,23 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, goto out_revert_creds; } - olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, - old->d_name.len); - err = PTR_ERR(olddentry); - if (IS_ERR(olddentry)) + de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, + old->d_name.len); + err = PTR_ERR(de); + if (IS_ERR(de)) goto out_unlock; + olddentry = de; err = -ESTALE; if (!ovl_matches_upper(old, olddentry)) - goto out_dput_old; + goto out_unlock; - newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, - new->d_name.len); - err = PTR_ERR(newdentry); - if (IS_ERR(newdentry)) - goto out_dput_old; + de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(de); + if (IS_ERR(de)) + goto out_unlock; + newdentry = de; old_opaque = ovl_dentry_is_opaque(old); new_opaque = ovl_dentry_is_opaque(new); @@ -1207,28 +1224,28 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (d_inode(new) && ovl_dentry_upper(new)) { if (opaquedir) { if (newdentry != opaquedir) - goto out_dput; + goto out_unlock; } else { if (!ovl_matches_upper(new, newdentry)) - goto out_dput; + goto out_unlock; } } else { if (!d_is_negative(newdentry)) { if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry)) - goto out_dput; + goto out_unlock; } else { if (flags & RENAME_EXCHANGE) - goto out_dput; + goto out_unlock; } } if (olddentry == trap) - goto out_dput; + goto out_unlock; if (newdentry == trap) - goto out_dput; + goto out_unlock; if (olddentry->d_inode == newdentry->d_inode) - goto out_dput; + goto out_unlock; err = 0; if (ovl_type_merge_or_lower(old)) @@ -1236,7 +1253,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); if (err) - goto out_dput; + goto out_unlock; if (!overwrite && ovl_type_merge_or_lower(new)) err = ovl_set_redirect(new, samedir); @@ -1244,15 +1261,16 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, ovl_type_merge(old->d_parent)) err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); if (err) - goto out_dput; + goto out_unlock; - err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, flags); + err = ovl_do_rename(ofs, old_upperdir, olddentry, + new_upperdir, newdentry, flags); + unlock_rename(new_upperdir, old_upperdir); if (err) - goto out_dput; + goto out_revert_creds; if (cleanup_whiteout) - ovl_cleanup(ofs, old_upperdir->d_inode, newdentry); + ovl_cleanup(ofs, old_upperdir, newdentry); if (overwrite && d_inode(new)) { if (new_is_dir) @@ -1271,12 +1289,6 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (d_inode(new) && ovl_dentry_upper(new)) ovl_copyattr(d_inode(new)); -out_dput: - dput(newdentry); -out_dput_old: - dput(olddentry); -out_unlock: - unlock_rename(new_upperdir, old_upperdir); out_revert_creds: ovl_revert_creds(old_cred); if (update_nlink) @@ -1284,9 +1296,15 @@ out_revert_creds: else ovl_drop_write(old); out: + dput(newdentry); + dput(olddentry); dput(opaquedir); ovl_cache_free(&list); return err; + +out_unlock: + unlock_rename(new_upperdir, old_upperdir); + goto out_revert_creds; } static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 969b458100fe..f5b8877d5fe2 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -48,8 +48,8 @@ static struct file *ovl_open_realfile(const struct file *file, if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; - realfile = backing_file_open(&file->f_path, flags, realpath, - current_cred()); + realfile = backing_file_open(file_user_path(file), + flags, realpath, current_cred()); } ovl_revert_creds(old_cred); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6f0e15f86c21..ecb9f2019395 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -610,7 +610,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Introducing security_inode_fileattr_get/set() hooks would solve this issue * properly. */ -static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa, +static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa, bool set) { struct file *file; @@ -637,7 +637,7 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f return err; } -int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) +int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa) { int err; @@ -649,7 +649,7 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) } int ovl_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path upperpath; @@ -697,7 +697,7 @@ out: } /* Convert inode protection flags to fileattr flags */ -static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) +static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa) { BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL); BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); @@ -712,7 +712,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) } } -int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa) +int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa) { int err; @@ -720,13 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa) if (err) return err; - err = vfs_fileattr_get(realpath->dentry, fa); - if (err == -ENOIOCTLCMD) - err = -ENOTTY; - return err; + return vfs_fileattr_get(realpath->dentry, fa); } -int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path realpath; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index bf722daf19a9..76d6248b625e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -16,6 +16,7 @@ struct ovl_lookup_data { struct super_block *sb; + struct dentry *dentry; const struct ovl_layer *layer; struct qstr name; bool is_dir; @@ -24,6 +25,7 @@ struct ovl_lookup_data { bool stop; bool last; char *redirect; + char *upperredirect; int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; @@ -228,13 +230,26 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret, bool drop_negative) { struct ovl_fs *ofs = OVL_FS(d->sb); - struct dentry *this; + struct dentry *this = NULL; + const char *warn; struct path path; int err; bool last_element = !post[0]; bool is_upper = d->layer->idx == 0; char val; + /* + * We allow filesystems that are case-folding capable but deny composing + * ovl stack from case-folded directories. If someone has enabled case + * folding on a directory on underlying layer, the warranty of the ovl + * stack is voided. + */ + if (ovl_dentry_casefolded(base)) { + warn = "case folded parent"; + err = -ESTALE; + goto out_warn; + } + this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); @@ -244,10 +259,17 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out_err; } + if (ovl_dentry_casefolded(this)) { + warn = "case folded child"; + err = -EREMOTE; + goto out_warn; + } + if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ + warn = "unsupported object type"; err = -EREMOTE; - goto out_err; + goto out_warn; } path.dentry = this; @@ -281,8 +303,9 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, } else { if (ovl_lookup_trap_inode(d->sb, this)) { /* Caught in a trap of overlapping layers */ + warn = "overlapping layers"; err = -ELOOP; - goto out_err; + goto out_warn; } if (last_element) @@ -314,6 +337,10 @@ put_and_out: this = NULL; goto out; +out_warn: + pr_warn_ratelimited("failed lookup in %s (%pd2, name='%.*s', err=%i): %s\n", + is_upper ? "upper" : "lower", base, + namelen, name, err, warn); out_err: dput(this); return err; @@ -1024,6 +1051,31 @@ int ovl_verify_lowerdata(struct dentry *dentry) return ovl_maybe_validate_verity(dentry); } +/* + * Following redirects/metacopy can have security consequences: it's like a + * symlink into the lower layer without the permission checks. + * + * This is only a problem if the upper layer is untrusted (e.g comes from an USB + * drive). This can allow a non-readable file or directory to become readable. + * + * Only following redirects when redirects are enabled disables this attack + * vector when not necessary. + */ +static bool ovl_check_follow_redirect(struct ovl_lookup_data *d) +{ + struct ovl_fs *ofs = OVL_FS(d->sb); + + if (d->metacopy && !ofs->config.metacopy) { + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", d->dentry); + return false; + } + if ((d->redirect || d->upperredirect) && !ovl_redirect_follow(ofs)) { + pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", d->dentry); + return false; + } + return true; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -1039,7 +1091,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; - char *upperredirect = NULL; + bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); struct dentry *this; unsigned int i; int err; @@ -1047,12 +1099,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, int metacopy_size = 0; struct ovl_lookup_data d = { .sb = dentry->d_sb, + .dentry = dentry, .name = dentry->d_name, .is_dir = false, .opaque = false, .stop = false, - .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), + .last = check_redirect ? false : !ovl_numlower(poe), .redirect = NULL, + .upperredirect = NULL, .metacopy = 0, }; @@ -1094,8 +1148,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.redirect) { err = -ENOMEM; - upperredirect = kstrdup(d.redirect, GFP_KERNEL); - if (!upperredirect) + d.upperredirect = kstrdup(d.redirect, GFP_KERNEL); + if (!d.upperredirect) goto out_put_upper; if (d.redirect[0] == '/') poe = roe; @@ -1113,7 +1167,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; - if (!ovl_redirect_follow(ofs)) + if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_put; + } + + if (!check_redirect) d.last = i == ovl_numlower(poe) - 1; else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); @@ -1126,13 +1185,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!this) continue; - if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { - dput(this); - err = -EPERM; - pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); - goto out_put; - } - /* * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". @@ -1185,23 +1237,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ctr++; } - /* - * Following redirects can have security consequences: it's like - * a symlink into the lower layer without the permission checks. - * This is only a problem if the upper layer is untrusted (e.g - * comes from an USB drive). This can allow a non-readable file - * or directory to become readable. - * - * Only following redirects when redirects are enabled disables - * this attack vector when not necessary. - */ - err = -EPERM; - if (d.redirect && !ovl_redirect_follow(ofs)) { - pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", - dentry); - goto out_put; - } - if (d.stop) break; @@ -1212,10 +1247,16 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } - /* Defer lookup of lowerdata in data-only layers to first access */ + /* + * Defer lookup of lowerdata in data-only layers to first access. + * Don't require redirect=follow and metacopy=on in this case. + */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { d.metacopy = 0; ctr++; + } else if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_put; } /* @@ -1298,20 +1339,26 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, /* * It's safe to assign upperredirect here: the previous - * assignment of happens only if upperdentry is non-NULL, and + * assignment happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ - upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); - if (IS_ERR(upperredirect)) { - err = PTR_ERR(upperredirect); - upperredirect = NULL; + d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); + if (IS_ERR(d.upperredirect)) { + err = PTR_ERR(d.upperredirect); + d.upperredirect = NULL; goto out_free_oe; } + err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) goto out_free_oe; - uppermetacopy = err; + d.metacopy = uppermetacopy = err; metacopy_size = err; + + if (!ovl_check_follow_redirect(&d)) { + err = -EPERM; + goto out_free_oe; + } } if (upperdentry || ctr) { @@ -1319,7 +1366,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .upperdentry = upperdentry, .oe = oe, .index = index, - .redirect = upperredirect, + .redirect = d.upperredirect, }; /* Store lowerdata redirect for lazy lookup */ @@ -1361,7 +1408,7 @@ out_put_upper: kfree(origin_path); } dput(upperdentry); - kfree(upperredirect); + kfree(d.upperredirect); out: kfree(d.redirect); ovl_revert_creds(old_cred); @@ -1371,7 +1418,7 @@ out: bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); - struct qstr *name = &dentry->d_name; + const struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; @@ -1394,9 +1441,15 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; + /* + * We need to make a non-const copy of dentry->d_name, + * because lookup_one_positive_unlocked() will hash name + * with parentpath base, which is on another (lower fs). + */ this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), - name, parentpath->dentry); + &QSTR_LEN(name->name, name->len), + parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8baaba0a3fe5..bb0d7ded8e76 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -246,9 +246,11 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { - dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); - return dentry; + struct dentry *ret; + + ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret)); + return ret; } static inline int ovl_do_mknod(struct ovl_fs *ofs, @@ -353,19 +355,19 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } -static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, - struct dentry *olddentry, struct inode *newdir, +static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, + struct dentry *olddentry, struct dentry *newdir, struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), - .old_dir = olddir, - .old_dentry = olddentry, + .old_parent = olddir, + .old_dentry = olddentry, .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), - .new_dir = newdir, - .new_dentry = newdentry, - .flags = flags, + .new_parent = newdir, + .new_dentry = newdentry, + .flags = flags, }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); @@ -405,6 +407,15 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base); } +static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs, + const char *name, + struct dentry *base, + int len) +{ + return lookup_one_unlocked(ovl_upper_mnt_idmap(ofs), + &QSTR_LEN(name, len), base); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) @@ -414,6 +425,11 @@ static inline bool ovl_open_flags_need_copy_up(int flags) } /* util.c */ +int ovl_parent_lock(struct dentry *parent, struct dentry *child); +static inline void ovl_parent_unlock(struct dentry *parent) +{ + inode_unlock(parent->d_inode); +} int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); void ovl_start_write(struct dentry *dentry); @@ -446,6 +462,12 @@ void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry, void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe, unsigned int mask); bool ovl_dentry_weird(struct dentry *dentry); + +static inline bool ovl_dentry_casefolded(struct dentry *dentry) +{ + return sb_has_encoding(dentry->d_sb) && IS_CASEFOLDED(d_inode(dentry)); +} + enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); @@ -533,7 +555,8 @@ bool ovl_is_inuse(struct dentry *dentry); bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); -int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work, + struct dentry *upperdir, struct dentry *upper); int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, struct ovl_metacopy *data); int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, @@ -721,7 +744,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(const struct path *realpath); -int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); @@ -815,7 +838,7 @@ void ovl_copyattr(struct inode *to); void ovl_check_protattr(struct inode *inode, struct dentry *upper); int ovl_set_protattr(struct inode *inode, struct dentry *upper, - struct fileattr *fa); + struct file_kattr *fa); static inline void ovl_copyflags(struct inode *from, struct inode *to) { @@ -826,7 +849,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry); struct ovl_cattr { dev_t rdev; @@ -838,20 +861,20 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) struct dentry *ovl_create_real(struct ovl_fs *ofs, - struct inode *dir, struct dentry *newdentry, + struct dentry *parent, struct dentry *newdentry, struct ovl_cattr *attr); -int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); +int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry); struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; -int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); -int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); -int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa); +int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa); +int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int ovl_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); struct ovl_file; struct ovl_file *ovl_file_alloc(struct file *realfile); void ovl_file_free(struct ovl_file *of); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index cb449ab310a7..4c1bae935ced 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -51,7 +51,7 @@ struct ovl_path { struct ovl_entry { unsigned int __numlower; - struct ovl_path __lowerstack[]; + struct ovl_path __lowerstack[] __counted_by(__numlower); }; /* private information held for overlayfs's superblock */ @@ -88,6 +88,7 @@ struct ovl_fs { /* Shared whiteout cache */ struct dentry *whiteout; bool no_shared_whiteout; + struct mutex whiteout_lock; /* r/o snapshot of upperdir sb's only taken on volatile mounts */ errseq_t errseq; }; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 6759f7d040c8..f4e7fff909ac 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -282,13 +282,11 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, return invalfc(fc, "%s is not a directory", name); /* - * Root dentries of case-insensitive capable filesystems might - * not have the dentry operations set, but still be incompatible - * with overlayfs. Check explicitly to prevent post-mount - * failures. + * Allow filesystems that are case-folding capable but deny composing + * ovl stack from case-folded directories. */ - if (sb_has_encoding(path->mnt->mnt_sb)) - return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name); + if (ovl_dentry_casefolded(path->dentry)) + return invalfc(fc, "case-insensitive directory on %s not supported", name); if (ovl_dentry_weird(path->dentry)) return invalfc(fc, "filesystem on %s not supported", name); @@ -797,6 +795,8 @@ int ovl_init_fs_context(struct fs_context *fc) fc->s_fs_info = ofs; fc->fs_private = ctx; fc->ops = &ovl_context_ops; + + mutex_init(&ofs->whiteout_lock); return 0; out_err: @@ -871,18 +871,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->uuid = OVL_UUID_NULL; } - /* Resolve verity -> metacopy dependency */ - if (config->verity_mode && !config->metacopy) { - /* Don't allow explicit specified conflicting combinations */ - if (set.metacopy) { - pr_err("conflicting options: metacopy=off,verity=%s\n", - ovl_verity_mode(config)); - return -EINVAL; - } - /* Otherwise automatically enable metacopy. */ - config->metacopy = true; - } - /* * This is to make the logic below simpler. It doesn't make any other * difference, since redirect_dir=on is only used for upper. @@ -890,18 +878,13 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW) config->redirect_mode = OVL_REDIRECT_ON; - /* Resolve verity -> metacopy -> redirect_dir dependency */ + /* metacopy -> redirect_dir dependency */ if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) { if (set.metacopy && set.redirect) { pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", ovl_redirect_mode(config)); return -EINVAL; } - if (config->verity_mode && set.redirect) { - pr_err("conflicting options: verity=%s,redirect_dir=%s\n", - ovl_verity_mode(config), ovl_redirect_mode(config)); - return -EINVAL; - } if (set.redirect) { /* * There was an explicit redirect_dir=... that resulted @@ -970,7 +953,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, } - /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */ + /* Resolve userxattr -> !redirect && !metacopy dependency */ if (config->userxattr) { if (set.redirect && config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { @@ -982,11 +965,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, pr_err("conflicting options: userxattr,metacopy=on\n"); return -EINVAL; } - if (config->verity_mode) { - pr_err("conflicting options: userxattr,verity=%s\n", - ovl_verity_mode(config)); - return -EINVAL; - } /* * Silently disable default setting of redirect and metacopy. * This shall be the default in the future as well: these @@ -1025,11 +1003,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, */ } - if (ctx->nr_data > 0 && !config->metacopy) { - pr_err("lower data-only dirs require metacopy support.\n"); - return -EINVAL; - } - return 0; } @@ -1078,17 +1051,16 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_printf(m, ",redirect_dir=%s", ovl_redirect_mode(&ofs->config)); if (ofs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + seq_printf(m, ",index=%s", str_on_off(ofs->config.index)); if (ofs->config.uuid != ovl_uuid_def()) seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config)); if (ofs->config.nfs_export != ovl_nfs_export_def) - seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? - "on" : "off"); + seq_printf(m, ",nfs_export=%s", + str_on_off(ofs->config.nfs_export)); if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(ofs)) seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config)); if (ofs->config.metacopy != ovl_metacopy_def) - seq_printf(m, ",metacopy=%s", - ofs->config.metacopy ? "on" : "off"); + seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy)); if (ofs->config.ovl_volatile) seq_puts(m, ",volatile"); if (ofs->config.userxattr) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 44e208da417c..b65cdfce31ce 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -13,6 +13,7 @@ #include <linux/security.h> #include <linux/cred.h> #include <linux/ratelimit.h> +#include <linux/overflow.h> #include "overlayfs.h" struct ovl_cache_entry { @@ -147,9 +148,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; - size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); - p = kmalloc(size, GFP_KERNEL); + p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL); if (!p) return NULL; @@ -1034,14 +1034,13 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, { struct ovl_cache_entry *p; - inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; - dentry = ovl_lookup_upper(ofs, p->name, upper, p->len); + dentry = ovl_lookup_upper_unlocked(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, @@ -1049,10 +1048,9 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, continue; } if (dentry->d_inode) - ovl_cleanup(ofs, upper->d_inode, dentry); + ovl_cleanup(ofs, upper, dentry); dput(dentry); } - inode_unlock(upper->d_inode); } static bool ovl_check_d_type(struct dir_context *ctx, const char *name, @@ -1098,7 +1096,6 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa int level) { int err; - struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { @@ -1124,7 +1121,6 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa if (err) goto out; - inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { struct dentry *dentry; @@ -1139,39 +1135,40 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa err = -EINVAL; break; } - dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len); + dentry = ovl_lookup_upper_unlocked(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) - err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level); + err = ovl_workdir_cleanup(ofs, path->dentry, path->mnt, + dentry, level); dput(dentry); if (err) break; } - inode_unlock(dir); out: ovl_cache_free(&list); return err; } -int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, struct vfsmount *mnt, struct dentry *dentry, int level) { int err; - if (!d_is_dir(dentry) || level > 1) { - return ovl_cleanup(ofs, dir, dentry); - } + if (!d_is_dir(dentry) || level > 1) + return ovl_cleanup(ofs, parent, dentry); - err = ovl_do_rmdir(ofs, dir, dentry); + err = ovl_parent_lock(parent, dentry); + if (err) + return err; + err = ovl_do_rmdir(ofs, parent->d_inode, dentry); + ovl_parent_unlock(parent); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; - inode_unlock(dir); err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); - inode_lock_nested(dir, I_MUTEX_PARENT); if (!err) - err = ovl_cleanup(ofs, dir, dentry); + err = ovl_cleanup(ofs, parent, dentry); } return err; @@ -1182,7 +1179,6 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) int err; struct dentry *indexdir = ofs->workdir; struct dentry *index = NULL; - struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); struct ovl_cache_entry *p; @@ -1196,7 +1192,6 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) if (err) goto out; - inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { if (p->name[0] == '.') { if (p->len == 1) @@ -1204,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) if (p->len == 2 && p->name[1] == '.') continue; } - index = ovl_lookup_upper(ofs, p->name, indexdir, p->len); + index = ovl_lookup_upper_unlocked(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; @@ -1212,7 +1207,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == '#') { - err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1); + err = ovl_workdir_cleanup(ofs, indexdir, path.mnt, index, 1); if (err) break; goto next; @@ -1222,7 +1217,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ - err = ovl_cleanup(ofs, dir, index); + err = ovl_cleanup(ofs, indexdir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid corrupting the index if @@ -1235,10 +1230,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ - err = ovl_cleanup_and_whiteout(ofs, dir, index); + err = ovl_cleanup_and_whiteout(ofs, indexdir, index); } else { /* Cleanup orphan index entries */ - err = ovl_cleanup(ofs, dir, index); + err = ovl_cleanup(ofs, indexdir, index); } if (err) @@ -1249,7 +1244,6 @@ next: index = NULL; } dput(index); - inode_unlock(dir); out: ovl_cache_free(&list); if (err) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e19940d649ca..df85a76597e9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -299,8 +299,8 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, int err; bool retried = false; - inode_lock_nested(dir, I_MUTEX_PARENT); retry: + inode_lock_nested(dir, I_MUTEX_PARENT); work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { @@ -311,23 +311,24 @@ retry: if (work->d_inode) { err = -EEXIST; + inode_unlock(dir); if (retried) goto out_dput; if (persist) - goto out_unlock; + return work; retried = true; - err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0); + err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0); dput(work); - if (err == -EINVAL) { - work = ERR_PTR(err); - goto out_unlock; - } + if (err == -EINVAL) + return ERR_PTR(err); + goto retry; } work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); + inode_unlock(dir); err = PTR_ERR(work); if (IS_ERR(work)) goto out_err; @@ -365,11 +366,10 @@ retry: if (err) goto out_dput; } else { + inode_unlock(dir); err = PTR_ERR(work); goto out_err; } -out_unlock: - inode_unlock(dir); return work; out_dput: @@ -377,8 +377,7 @@ out_dput: out_err: pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n", ofs->config.workdir, name, -err); - work = NULL; - goto out_unlock; + return NULL; } static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, @@ -557,37 +556,42 @@ out: static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { struct dentry *workdir = ofs->workdir; - struct inode *dir = d_inode(workdir); struct dentry *temp; struct dentry *dest; struct dentry *whiteout; struct name_snapshot name; int err; - inode_lock_nested(dir, I_MUTEX_PARENT); - temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); err = PTR_ERR(temp); if (IS_ERR(temp)) - goto out_unlock; + return err; + err = ovl_parent_lock(workdir, temp); + if (err) { + dput(temp); + return err; + } dest = ovl_lookup_temp(ofs, workdir); err = PTR_ERR(dest); if (IS_ERR(dest)) { dput(temp); - goto out_unlock; + ovl_parent_unlock(workdir); + return err; } /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); + err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT); + ovl_parent_unlock(workdir); if (err) { if (err == -EINVAL) err = 0; goto cleanup_temp; } - whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len); + whiteout = ovl_lookup_upper_unlocked(ofs, name.name.name, + workdir, name.name.len); err = PTR_ERR(whiteout); if (IS_ERR(whiteout)) goto cleanup_temp; @@ -596,18 +600,15 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) /* Best effort cleanup of whiteout and temp file */ if (err) - ovl_cleanup(ofs, dir, whiteout); + ovl_cleanup(ofs, workdir, whiteout); dput(whiteout); cleanup_temp: - ovl_cleanup(ofs, dir, temp); + ovl_cleanup(ofs, workdir, temp); release_dentry_name_snapshot(&name); dput(temp); dput(dest); -out_unlock: - inode_unlock(dir); - return err; } @@ -621,8 +622,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); child = ovl_lookup_upper(ofs, name, parent, len); if (!IS_ERR(child) && !child->d_inode) - child = ovl_create_real(ofs, parent->d_inode, child, - OVL_CATTR(mode)); + child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode)); inode_unlock(parent->d_inode); dput(parent); @@ -1322,7 +1322,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (WARN_ON(fc->user_ns != current_user_ns())) goto out_err; - sb->s_d_op = &ovl_dentry_operations; + set_default_d_op(sb, &ovl_dentry_operations); err = -ENOMEM; if (!ofs->creator_cred) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 0819c739cc2f..a33115e7384c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -15,6 +15,7 @@ #include <linux/uuid.h> #include <linux/namei.h> #include <linux/ratelimit.h> +#include <linux/overflow.h> #include "overlayfs.h" /* Get write access to upper mnt - may fail if upper sb was remounted ro */ @@ -145,9 +146,9 @@ void ovl_stack_free(struct ovl_path *stack, unsigned int n) struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { - size_t size = offsetof(struct ovl_entry, __lowerstack[numlower]); - struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + struct ovl_entry *oe; + oe = kzalloc(struct_size(oe, __lowerstack, numlower), GFP_KERNEL); if (oe) oe->__numlower = numlower; @@ -205,10 +206,17 @@ bool ovl_dentry_weird(struct dentry *dentry) if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry)) return true; - return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | - DCACHE_MANAGE_TRANSIT | - DCACHE_OP_HASH | - DCACHE_OP_COMPARE); + if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT)) + return true; + + /* + * Allow filesystems that are case-folding capable but deny composing + * ovl stack from case-folded directories. + */ + if (sb_has_encoding(dentry->d_sb)) + return IS_CASEFOLDED(d_inode(dentry)); + + return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE); } enum ovl_path_type ovl_path_type(struct dentry *dentry) @@ -305,7 +313,9 @@ enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path) struct dentry *ovl_dentry_upper(struct dentry *dentry) { - return ovl_upperdentry_dereference(OVL_I(d_inode(dentry))); + struct inode *inode = d_inode(dentry); + + return inode ? ovl_upperdentry_dereference(OVL_I(inode)) : NULL; } struct dentry *ovl_dentry_lower(struct dentry *dentry) @@ -956,7 +966,7 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper) } int ovl_set_protattr(struct inode *inode, struct dentry *upper, - struct fileattr *fa) + struct file_kattr *fa) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); char buf[OVL_PROTATTR_MAX]; @@ -1068,7 +1078,6 @@ static void ovl_cleanup_index(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); - struct inode *dir = indexdir->d_inode; struct dentry *lowerdentry = ovl_dentry_lower(dentry); struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *index = NULL; @@ -1104,21 +1113,18 @@ static void ovl_cleanup_index(struct dentry *dentry) goto out; } - inode_lock_nested(dir, I_MUTEX_PARENT); - index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); + index = ovl_lookup_upper_unlocked(ofs, name.name, indexdir, name.len); err = PTR_ERR(index); if (IS_ERR(index)) { index = NULL; } else if (ovl_index_all(dentry->d_sb)) { /* Whiteout orphan index to block future open by handle */ err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb), - dir, index); + indexdir, index); } else { /* Cleanup orphan index entries */ - err = ovl_cleanup(ofs, dir, index); + err = ovl_cleanup(ofs, indexdir, index); } - - inode_unlock(dir); if (err) goto fail; @@ -1217,20 +1223,21 @@ void ovl_nlink_end(struct dentry *dentry) ovl_inode_unlock(inode); } -int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work, + struct dentry *upperdir, struct dentry *upper) { struct dentry *trap; - /* Workdir should not be the same as upperdir */ - if (workdir == upperdir) - goto err; - /* Workdir should not be subdir of upperdir and vice versa */ trap = lock_rename(workdir, upperdir); if (IS_ERR(trap)) goto err; if (trap) goto err_unlock; + if (work && work->d_parent != workdir) + goto err_unlock; + if (upper && upper->d_parent != upperdir) + goto err_unlock; return 0; @@ -1541,3 +1548,13 @@ void ovl_copyattr(struct inode *inode) i_size_write(inode, i_size_read(realinode)); spin_unlock(&inode->i_lock); } + +int ovl_parent_lock(struct dentry *parent, struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (!child || child->d_parent == parent) + return 0; + + inode_unlock(parent->d_inode); + return -EINVAL; +} diff --git a/fs/pidfs.c b/fs/pidfs.c index c1f0a067be40..edc35522d75c 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -21,11 +21,23 @@ #include <linux/utsname.h> #include <net/net_namespace.h> #include <linux/coredump.h> +#include <linux/xattr.h> #include "internal.h" #include "mount.h" -static struct kmem_cache *pidfs_cachep __ro_after_init; +#define PIDFS_PID_DEAD ERR_PTR(-ESRCH) + +static struct kmem_cache *pidfs_attr_cachep __ro_after_init; +static struct kmem_cache *pidfs_xattr_cachep __ro_after_init; + +static struct path pidfs_root_path = {}; + +void pidfs_get_root(struct path *path) +{ + *path = pidfs_root_path; + path_get(path); +} /* * Stashes information that userspace needs to access even after the @@ -37,17 +49,12 @@ struct pidfs_exit_info { __u32 coredump_mask; }; -struct pidfs_inode { +struct pidfs_attr { + struct simple_xattrs *xattrs; struct pidfs_exit_info __pei; struct pidfs_exit_info *exit_info; - struct inode vfs_inode; }; -static inline struct pidfs_inode *pidfs_i(struct inode *inode) -{ - return container_of(inode, struct pidfs_inode, vfs_inode); -} - static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 @@ -125,6 +132,7 @@ void pidfs_add_pid(struct pid *pid) pid->ino = pidfs_ino_nr; pid->stashed = NULL; + pid->attr = NULL; pidfs_ino_nr++; write_seqcount_begin(&pidmap_lock_seq); @@ -139,6 +147,33 @@ void pidfs_remove_pid(struct pid *pid) write_seqcount_end(&pidmap_lock_seq); } +void pidfs_free_pid(struct pid *pid) +{ + struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr); + struct simple_xattrs *xattrs __free(kfree) = NULL; + + /* + * Any dentry must've been wiped from the pid by now. + * Otherwise there's a reference count bug. + */ + VFS_WARN_ON_ONCE(pid->stashed); + + /* + * This if an error occurred during e.g., task creation that + * causes us to never go through the exit path. + */ + if (unlikely(!attr)) + return; + + /* This never had a pidfd created. */ + if (IS_ERR(attr)) + return; + + xattrs = no_free_ptr(attr->xattrs); + if (xattrs) + simple_xattrs_free(xattrs, NULL); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -261,13 +296,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; - struct inode *inode = file_inode(file); struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; struct task_struct *task; + struct pidfs_attr *attr; const struct cred *c; __u64 mask; @@ -286,8 +321,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!pid_in_current_pidns(pid)) return -ESRCH; + attr = READ_ONCE(pid->attr); if (mask & PIDFD_INFO_EXIT) { - exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + exit_info = READ_ONCE(attr->exit_info); if (exit_info) { kinfo.mask |= PIDFD_INFO_EXIT; #ifdef CONFIG_CGROUPS @@ -300,7 +336,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (mask & PIDFD_INFO_COREDUMP) { kinfo.mask |= PIDFD_INFO_COREDUMP; - kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask); + kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask); } task = get_pid_task(pid, PIDTYPE_PID); @@ -319,7 +355,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!c) return -ESRCH; - if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) { + if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { task_lock(task); if (task->mm) kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); @@ -366,7 +402,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; - if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) + if (kinfo.pid == 0 || kinfo.tgid == 0) return -ESRCH; copy_out: @@ -552,41 +588,61 @@ struct pid *pidfd_pid(const struct file *file) * task has been reaped which cannot happen until we're out of * release_task(). * - * If this struct pid is referred to by a pidfd then - * stashed_dentry_get() will return the dentry and inode for that struct - * pid. Since we've taken a reference on it there's now an additional - * reference from the exit path on it. Which is fine. We're going to put - * it again in a second and we know that the pid is kept alive anyway. + * If this struct pid has at least once been referred to by a pidfd then + * pid->attr will be allocated. If not we mark the struct pid as dead so + * anyone who is trying to register it with pidfs will fail to do so. + * Otherwise we would hand out pidfs for reaped tasks without having + * exit information available. * - * Worst case is that we've filled in the info and immediately free the - * dentry and inode afterwards since the pidfd has been closed. Since + * Worst case is that we've filled in the info and the pid gets freed + * right away in free_pid() when no one holds a pidfd anymore. Since * pidfs_exit() currently is placed after exit_task_work() we know that - * it cannot be us aka the exiting task holding a pidfd to ourselves. + * it cannot be us aka the exiting task holding a pidfd to itself. */ void pidfs_exit(struct task_struct *tsk) { - struct dentry *dentry; + struct pid *pid = task_pid(tsk); + struct pidfs_attr *attr; + struct pidfs_exit_info *exit_info; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif might_sleep(); - dentry = stashed_dentry_get(&task_pid(tsk)->stashed); - if (dentry) { - struct inode *inode = d_inode(dentry); - struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; + guard(spinlock_irq)(&pid->wait_pidfd.lock); + attr = pid->attr; + if (!attr) { + /* + * No one ever held a pidfd for this struct pid. + * Mark it as dead so no one can add a pidfs + * entry anymore. We're about to be reaped and + * so no exit information would be available. + */ + pid->attr = PIDFS_PID_DEAD; + return; + } - rcu_read_lock(); - cgrp = task_dfl_cgroup(tsk); - exit_info->cgroupid = cgroup_id(cgrp); - rcu_read_unlock(); + /* + * If @pid->attr is set someone might still legitimately hold a + * pidfd to @pid or someone might concurrently still be getting + * a reference to an already stashed dentry from @pid->stashed. + * So defer cleaning @pid->attr until the last reference to @pid + * is put + */ + + exit_info = &attr->__pei; + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); #endif - exit_info->exit_code = tsk->exit_code; + exit_info->exit_code = tsk->exit_code; - /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ - smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); - dput(dentry); - } + /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ + smp_store_release(&attr->exit_info, &attr->__pei); } #ifdef CONFIG_COREDUMP @@ -594,16 +650,15 @@ void pidfs_coredump(const struct coredump_params *cprm) { struct pid *pid = cprm->pid; struct pidfs_exit_info *exit_info; - struct dentry *dentry; - struct inode *inode; + struct pidfs_attr *attr; __u32 coredump_mask = 0; - dentry = pid->stashed; - if (WARN_ON_ONCE(!dentry)) - return; + attr = READ_ONCE(pid->attr); - inode = d_inode(dentry); - exit_info = &pidfs_i(inode)->__pei; + VFS_WARN_ON_ONCE(!attr); + VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); + + exit_info = &attr->__pei; /* Note how we were coredumped. */ coredump_mask = pidfs_coredump_mask(cprm->mm_flags); /* Note that we actually did coredump. */ @@ -634,9 +689,24 @@ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); } +static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) + return 0; + + return simple_xattr_list(inode, xattrs, buf, size); +} + static const struct inode_operations pidfs_inode_operations = { - .getattr = pidfs_getattr, - .setattr = pidfs_setattr, + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, + .listxattr = pidfs_listxattr, }; static void pidfs_evict_inode(struct inode *inode) @@ -647,30 +717,9 @@ static void pidfs_evict_inode(struct inode *inode) put_pid(pid); } -static struct inode *pidfs_alloc_inode(struct super_block *sb) -{ - struct pidfs_inode *pi; - - pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); - if (!pi) - return NULL; - - memset(&pi->__pei, 0, sizeof(pi->__pei)); - pi->exit_info = NULL; - - return &pi->vfs_inode; -} - -static void pidfs_free_inode(struct inode *inode) -{ - kmem_cache_free(pidfs_cachep, pidfs_i(inode)); -} - static const struct super_operations pidfs_sops = { - .alloc_inode = pidfs_alloc_inode, .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, - .free_inode = pidfs_free_inode, .statfs = simple_statfs, }; @@ -770,6 +819,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, if (ret < 0) return ERR_PTR(ret); + VFS_WARN_ON_ONCE(!pid->attr); + mntput(path.mnt); return path.dentry; } @@ -796,53 +847,8 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, - unsigned int flags) -{ - enum pid_type type; - - if (flags & PIDFD_STALE) - return true; - - /* - * Make sure that if a pidfd is created PIDFD_INFO_EXIT - * information will be available. So after an inode for the - * pidfd has been allocated perform another check that the pid - * is still alive. If it is exit information is available even - * if the task gets reaped before the pidfd is returned to - * userspace. The only exception are indicated by PIDFD_STALE: - * - * (1) The kernel is in the middle of task creation and thus no - * task linkage has been established yet. - * (2) The caller knows @pid has been registered in pidfs at a - * time when the task was still alive. - * - * In both cases exit information will have been reported. - */ - if (flags & PIDFD_THREAD) - type = PIDTYPE_PID; - else - type = PIDTYPE_TGID; - - /* - * Since pidfs_exit() is called before struct pid's task linkage - * is removed the case where the task got reaped but a dentry - * was already attached to struct pid and exit information was - * recorded and published can be handled correctly. - */ - if (unlikely(!pid_has_task(pid, type))) { - struct inode *inode = d_inode(path->dentry); - return !!READ_ONCE(pidfs_i(inode)->exit_info); - } - - return true; -} - static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { - if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) - return ERR_PTR(-ESRCH); - /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. @@ -864,6 +870,8 @@ static int pidfs_init_inode(struct inode *inode, void *data) inode->i_private = data; inode->i_flags |= S_PRIVATE | S_ANON_INODE; + /* We allow to set xattrs. */ + inode->i_flags &= ~S_IMMUTABLE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; @@ -878,9 +886,127 @@ static void pidfs_put_data(void *data) put_pid(pid); } +/** + * pidfs_register_pid - register a struct pid in pidfs + * @pid: pid to pin + * + * Register a struct pid in pidfs. + * + * Return: On success zero, on error a negative error code is returned. + */ +int pidfs_register_pid(struct pid *pid) +{ + struct pidfs_attr *new_attr __free(kfree) = NULL; + struct pidfs_attr *attr; + + might_sleep(); + + if (!pid) + return 0; + + attr = READ_ONCE(pid->attr); + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (attr) + return 0; + + new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL); + if (!new_attr) + return -ENOMEM; + + /* Synchronize with pidfs_exit(). */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); + + attr = pid->attr; + if (unlikely(attr == PIDFS_PID_DEAD)) + return PTR_ERR(PIDFS_PID_DEAD); + if (unlikely(attr)) + return 0; + + pid->attr = no_free_ptr(new_attr); + return 0; +} + +static struct dentry *pidfs_stash_dentry(struct dentry **stashed, + struct dentry *dentry) +{ + int ret; + struct pid *pid = d_inode(dentry)->i_private; + + VFS_WARN_ON_ONCE(stashed != &pid->stashed); + + ret = pidfs_register_pid(pid); + if (ret) + return ERR_PTR(ret); + + return stash_dentry(stashed, dentry); +} + static const struct stashed_operations pidfs_stashed_ops = { - .init_inode = pidfs_init_inode, - .put_data = pidfs_put_data, + .stash_dentry = pidfs_stash_dentry, + .init_inode = pidfs_init_inode, + .put_data = pidfs_put_data, +}; + +static int pidfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, void *value, size_t size) +{ + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + const char *name; + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) + return 0; + + name = xattr_full_name(handler, suffix); + return simple_xattr_get(xattrs, name, value, size); +} + +static int pidfs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *suffix, + const void *value, size_t size, int flags) +{ + struct pid *pid = inode->i_private; + struct pidfs_attr *attr = pid->attr; + const char *name; + struct simple_xattrs *xattrs; + struct simple_xattr *old_xattr; + + /* Ensure we're the only one to set @attr->xattrs. */ + WARN_ON_ONCE(!inode_is_locked(inode)); + + xattrs = READ_ONCE(attr->xattrs); + if (!xattrs) { + xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL); + if (!xattrs) + return -ENOMEM; + + simple_xattrs_init(xattrs); + smp_store_release(&pid->attr->xattrs, xattrs); + } + + name = xattr_full_name(handler, suffix); + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; +} + +static const struct xattr_handler pidfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = pidfs_xattr_get, + .set = pidfs_xattr_set, +}; + +static const struct xattr_handler *const pidfs_xattr_handlers[] = { + &pidfs_trusted_xattr_handler, + NULL }; static int pidfs_init_fs_context(struct fs_context *fc) @@ -891,9 +1017,12 @@ static int pidfs_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; + ctx->xattr = pidfs_xattr_handlers; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } @@ -921,8 +1050,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) if (ret < 0) return ERR_PTR(ret); - if (!pidfs_pid_valid(pid, &path, flags)) - return ERR_PTR(-ESRCH); + VFS_WARN_ON_ONCE(!pid->attr); flags &= ~PIDFD_STALE; flags |= O_RDWR; @@ -934,79 +1062,21 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) return pidfd_file; } -/** - * pidfs_register_pid - register a struct pid in pidfs - * @pid: pid to pin - * - * Register a struct pid in pidfs. Needs to be paired with - * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. - * - * Return: On success zero, on error a negative error code is returned. - */ -int pidfs_register_pid(struct pid *pid) -{ - struct path path __free(path_put) = {}; - int ret; - - might_sleep(); - - if (!pid) - return 0; - - ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); - if (unlikely(ret)) - return ret; - /* Keep the dentry and only put the reference to the mount. */ - path.dentry = NULL; - return 0; -} - -/** - * pidfs_get_pid - pin a struct pid through pidfs - * @pid: pid to pin - * - * Similar to pidfs_register_pid() but only valid if the caller knows - * there's a reference to the @pid through a dentry already that can't - * go away. - */ -void pidfs_get_pid(struct pid *pid) -{ - if (!pid) - return; - WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); -} - -/** - * pidfs_put_pid - drop a pidfs reference - * @pid: pid to drop - * - * Drop a reference to @pid via pidfs. This is only safe if the - * reference has been taken via pidfs_get_pid(). - */ -void pidfs_put_pid(struct pid *pid) -{ - might_sleep(); - - if (!pid) - return; - VFS_WARN_ON_ONCE(!pid->stashed); - dput(pid->stashed); -} - -static void pidfs_inode_init_once(void *data) -{ - struct pidfs_inode *pi = data; - - inode_init_once(&pi->vfs_inode); -} - void __init pidfs_init(void) { - pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, + pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_ACCOUNT | SLAB_PANIC), - pidfs_inode_init_once); + SLAB_ACCOUNT | SLAB_PANIC), NULL); + + pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache", + sizeof(struct simple_xattrs), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), NULL); + pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_root_path.mnt = pidfs_mnt; + pidfs_root_path.dentry = pidfs_mnt->mnt_root; } diff --git a/fs/pipe.c b/fs/pipe.c index da45edd68c41..731622d0738d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,6 +26,7 @@ #include <linux/memcontrol.h> #include <linux/watch_queue.h> #include <linux/sysctl.h> +#include <linux/sort.h> #include <linux/uaccess.h> #include <asm/ioctls.h> @@ -76,8 +77,6 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int pipe_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) @@ -964,6 +963,11 @@ int create_pipe_files(struct file **res, int flags) res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); + + /* pipe groks IOCB_NOWAIT */ + res[0]->f_mode |= FMODE_NOWAIT; + res[1]->f_mode |= FMODE_NOWAIT; + /* * Disable permission and pre-content events, but enable legacy * inotify events for legacy users. @@ -998,9 +1002,6 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; - /* pipe groks IOCB_NOWAIT */ - files[0]->f_mode |= FMODE_NOWAIT; - files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: diff --git a/fs/pnode.c b/fs/pnode.c index ffd429b760d5..81f7599bdac4 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -21,17 +21,12 @@ static inline struct mount *next_peer(struct mount *p) static inline struct mount *first_slave(struct mount *p) { - return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); -} - -static inline struct mount *last_slave(struct mount *p) -{ - return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); + return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { - return list_entry(p->mnt_slave.next, struct mount, mnt_slave); + return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } static struct mount *get_peer_under_root(struct mount *mnt, @@ -70,69 +65,90 @@ int get_dominating_id(struct mount *mnt, const struct path *root) return 0; } -static int do_make_slave(struct mount *mnt) +static inline bool will_be_unmounted(struct mount *m) { - struct mount *master, *slave_mnt; + return m->mnt.mnt_flags & MNT_UMOUNT; +} - if (list_empty(&mnt->mnt_share)) { - if (IS_MNT_SHARED(mnt)) { - mnt_release_group_id(mnt); - CLEAR_MNT_SHARED(mnt); - } - master = mnt->mnt_master; - if (!master) { - struct list_head *p = &mnt->mnt_slave_list; - while (!list_empty(p)) { - slave_mnt = list_first_entry(p, - struct mount, mnt_slave); - list_del_init(&slave_mnt->mnt_slave); - slave_mnt->mnt_master = NULL; - } - return 0; - } - } else { +static struct mount *propagation_source(struct mount *mnt) +{ + do { struct mount *m; - /* - * slave 'mnt' to a peer mount that has the - * same root dentry. If none is available then - * slave it to anything that is available. - */ - for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { - if (m->mnt.mnt_root == mnt->mnt.mnt_root) { - master = m; - break; - } + for (m = next_peer(mnt); m != mnt; m = next_peer(m)) { + if (!will_be_unmounted(m)) + return m; } - list_del_init(&mnt->mnt_share); - mnt->mnt_group_id = 0; - CLEAR_MNT_SHARED(mnt); + mnt = mnt->mnt_master; + } while (mnt && will_be_unmounted(mnt)); + return mnt; +} + +static void transfer_propagation(struct mount *mnt, struct mount *to) +{ + struct hlist_node *p = NULL, *n; + struct mount *m; + + hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) { + m->mnt_master = to; + if (!to) + hlist_del_init(&m->mnt_slave); + else + p = &m->mnt_slave; } - list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) - slave_mnt->mnt_master = master; - list_move(&mnt->mnt_slave, &master->mnt_slave_list); - list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - mnt->mnt_master = master; - return 0; + if (p) + hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list); } /* - * vfsmount lock must be held for write + * EXCL[namespace_sem] */ void change_mnt_propagation(struct mount *mnt, int type) { + struct mount *m = mnt->mnt_master; + if (type == MS_SHARED) { set_mnt_shared(mnt); return; } - do_make_slave(mnt); - if (type != MS_SLAVE) { - list_del_init(&mnt->mnt_slave); + if (IS_MNT_SHARED(mnt)) { + m = propagation_source(mnt); + if (list_empty(&mnt->mnt_share)) { + mnt_release_group_id(mnt); + } else { + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + } + CLEAR_MNT_SHARED(mnt); + transfer_propagation(mnt, m); + } + hlist_del_init(&mnt->mnt_slave); + if (type == MS_SLAVE) { + mnt->mnt_master = m; + if (m) + hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list); + } else { mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) - mnt->mnt.mnt_flags |= MNT_UNBINDABLE; + mnt->mnt_t_flags |= T_UNBINDABLE; else - mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; + mnt->mnt_t_flags &= ~T_UNBINDABLE; + } +} + +static struct mount *__propagation_next(struct mount *m, + struct mount *origin) +{ + while (1) { + struct mount *master = m->mnt_master; + + if (master == origin->mnt_master) { + struct mount *next = next_peer(m); + return (next == origin) ? NULL : next; + } else if (m->mnt_slave.next) + return next_slave(m); + + /* back at master */ + m = master; } } @@ -150,34 +166,24 @@ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); - while (1) { - struct mount *master = m->mnt_master; - - if (master == origin->mnt_master) { - struct mount *next = next_peer(m); - return (next == origin) ? NULL : next; - } else if (m->mnt_slave.next != &master->mnt_slave_list) - return next_slave(m); - - /* back at master */ - m = master; - } + return __propagation_next(m, origin); } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* - * Advance m such that propagation_next will not return - * the slaves of m. + * Advance m past everything that gets propagation from it. */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) - m = last_slave(m); + struct mount *p = __propagation_next(m, origin); + + while (p && peers(m, p)) + p = __propagation_next(p, origin); - return m; + return p; } static struct mount *next_group(struct mount *m, struct mount *origin) @@ -185,7 +191,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) while (1) { while (1) { struct mount *next; - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { @@ -198,7 +204,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; - if (m->mnt_slave.next != &master->mnt_slave_list) + if (m->mnt_slave.next) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) @@ -212,142 +218,113 @@ static struct mount *next_group(struct mount *m, struct mount *origin) } } -/* all accesses are serialized by namespace_sem */ -static struct mount *last_dest, *first_source, *last_source, *dest_master; -static struct hlist_head *list; - -static inline bool peers(const struct mount *m1, const struct mount *m2) -{ - return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; -} - -static int propagate_one(struct mount *m, struct mountpoint *dest_mp) +static bool need_secondary(struct mount *m, struct mountpoint *dest_mp) { - struct mount *child; - int type; /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) - return 0; + return false; /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) - return 0; + return false; /* skip if m is in the anon_ns */ if (is_anon_ns(m->mnt_ns)) - return 0; + return false; + return true; +} - if (peers(m, last_dest)) { - type = CL_MAKE_SHARED; - } else { - struct mount *n, *p; - bool done; - for (n = m; ; n = p) { - p = n->mnt_master; - if (p == dest_master || IS_MNT_MARKED(p)) - break; +static struct mount *find_master(struct mount *m, + struct mount *last_copy, + struct mount *original) +{ + struct mount *p; + + // ascend until there's a copy for something with the same master + for (;;) { + p = m->mnt_master; + if (!p || IS_MNT_MARKED(p)) + break; + m = p; + } + while (!peers(last_copy, original)) { + struct mount *parent = last_copy->mnt_parent; + if (parent->mnt_master == p) { + if (!peers(parent, m)) + last_copy = last_copy->mnt_master; + break; } - do { - struct mount *parent = last_source->mnt_parent; - if (peers(last_source, first_source)) - break; - done = parent->mnt_master == p; - if (done && peers(n, parent)) - break; - last_source = last_source->mnt_master; - } while (!done); - - type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(m)) - type |= CL_MAKE_SHARED; + last_copy = last_copy->mnt_master; } - - child = copy_tree(last_source, last_source->mnt.mnt_root, type); - if (IS_ERR(child)) - return PTR_ERR(child); - read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(m, dest_mp, child); - if (m->mnt_master != dest_master) - SET_MNT_MARK(m->mnt_master); - read_sequnlock_excl(&mount_lock); - last_dest = m; - last_source = child; - hlist_add_head(&child->mnt_hash, list); - return count_mounts(m->mnt_ns, child); + return last_copy; } -/* - * mount 'source_mnt' under the destination 'dest_mnt' at - * dentry 'dest_dentry'. And propagate that mount to - * all the peer and slave mounts of 'dest_mnt'. - * Link all the new mounts into a propagation tree headed at - * source_mnt. Also link all the new mounts using ->mnt_list - * headed at source_mnt's ->mnt_list +/** + * propagate_mnt() - create secondary copies for tree attachment + * @dest_mnt: destination mount. + * @dest_mp: destination mountpoint. + * @source_mnt: source mount. + * @tree_list: list of secondaries to be attached. * - * @dest_mnt: destination mount. - * @dest_dentry: destination dentry. - * @source_mnt: source mount. - * @tree_list : list of heads of trees to be attached. + * Create secondary copies for attaching a tree with root @source_mnt + * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts + * into a propagation graph. Set mountpoints for all secondaries, + * link their roots into @tree_list via ->mnt_hash. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, - struct mount *source_mnt, struct hlist_head *tree_list) + struct mount *source_mnt, struct hlist_head *tree_list) { - struct mount *m, *n; - int ret = 0; - - /* - * we don't want to bother passing tons of arguments to - * propagate_one(); everything is serialized by namespace_sem, - * so globals will do just fine. - */ - last_dest = dest_mnt; - first_source = source_mnt; - last_source = source_mnt; - list = tree_list; - dest_master = dest_mnt->mnt_master; - - /* all peers of dest_mnt, except dest_mnt itself */ - for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { - ret = propagate_one(n, dest_mp); - if (ret) - goto out; - } - - /* all slave groups */ - for (m = next_group(dest_mnt, dest_mnt); m; - m = next_group(m, dest_mnt)) { - /* everything in that slave group */ - n = m; + struct mount *m, *n, *copy, *this; + int err = 0, type; + + if (dest_mnt->mnt_master) + SET_MNT_MARK(dest_mnt->mnt_master); + + /* iterate over peer groups, depth first */ + for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) { + if (m == dest_mnt) { // have one for dest_mnt itself + copy = source_mnt; + type = CL_MAKE_SHARED; + n = next_peer(m); + if (n == m) + continue; + } else { + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; + n = m; + } do { - ret = propagate_one(n, dest_mp); - if (ret) - goto out; - n = next_peer(n); - } while (n != m); + if (!need_secondary(n, dest_mp)) + continue; + if (type & CL_SLAVE) // first in this peer group + copy = find_master(n, copy, source_mnt); + this = copy_tree(copy, copy->mnt.mnt_root, type); + if (IS_ERR(this)) { + err = PTR_ERR(this); + break; + } + read_seqlock_excl(&mount_lock); + mnt_set_mountpoint(n, dest_mp, this); + read_sequnlock_excl(&mount_lock); + if (n->mnt_master) + SET_MNT_MARK(n->mnt_master); + copy = this; + hlist_add_head(&this->mnt_hash, tree_list); + err = count_mounts(n->mnt_ns, this); + if (err) + break; + type = CL_MAKE_SHARED; + } while ((n = next_peer(n)) != m); } -out: - read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; - if (m->mnt_master != dest_mnt->mnt_master) + if (m->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } - read_sequnlock_excl(&mount_lock); - return ret; -} - -static struct mount *find_topper(struct mount *mnt) -{ - /* If there is exactly one mount covering mnt completely return it. */ - struct mount *child; - - if (!list_is_singular(&mnt->mnt_mounts)) - return NULL; - - child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); - if (child->mnt_mountpoint != mnt->mnt.mnt_root) - return NULL; - - return child; + if (dest_mnt->mnt_master) + CLEAR_MNT_MARK(dest_mnt->mnt_master); + return err; } /* @@ -407,12 +384,8 @@ bool propagation_would_overmount(const struct mount *from, */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - if (mnt == parent) - return do_refcount_check(mnt, refcnt); - /* * quickly check if the current mount can be unmounted. * If not, we don't have to go checking for all other @@ -421,23 +394,27 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; - for (m = propagation_next(parent, parent); m; + if (mnt == parent) + return 0; + + for (struct mount *m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - int count = 1; - child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - if (!child) - continue; + struct list_head *head; + struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); - /* Is there exactly one mount on the child that covers - * it completely whose reference should be ignored? - */ - topper = find_topper(child); - if (topper) - count += 1; - else if (!list_empty(&child->mnt_mounts)) + if (!child) continue; - if (do_refcount_check(child, count)) + head = &child->mnt_mounts; + if (!list_empty(head)) { + /* + * a mount that covers child completely wouldn't prevent + * it being pulled out; any other would. + */ + if (!list_is_singular(head) || !child->overmount) + continue; + } + if (do_refcount_check(child, 1)) return 1; } return 0; @@ -463,181 +440,209 @@ void propagate_mount_unlock(struct mount *mnt) } } -static void umount_one(struct mount *mnt, struct list_head *to_umount) +static inline bool is_candidate(struct mount *m) { - CLEAR_MNT_MARK(mnt); - mnt->mnt.mnt_flags |= MNT_UMOUNT; - list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_umounting); - move_from_ns(mnt, to_umount); + return m->mnt_t_flags & T_UMOUNT_CANDIDATE; } -/* - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its - * parent propagates to. - */ -static bool __propagate_umount(struct mount *mnt, - struct list_head *to_umount, - struct list_head *to_restore) +static void umount_one(struct mount *m, struct list_head *to_umount) { - bool progress = false; - struct mount *child; - - /* - * The state of the parent won't change if this mount is - * already unmounted or marked as without children. - */ - if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) - goto out; + m->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&m->mnt_child); + move_from_ns(m); + list_add_tail(&m->mnt_list, to_umount); +} - /* Verify topper is the only grandchild that has not been - * speculatively unmounted. - */ - list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { - if (child->mnt_mountpoint == mnt->mnt.mnt_root) - continue; - if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) - continue; - /* Found a mounted child */ - goto children; - } +static void remove_from_candidate_list(struct mount *m) +{ + m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE); + list_del_init(&m->mnt_list); +} - /* Mark mounts that can be unmounted if not locked */ - SET_MNT_MARK(mnt); - progress = true; +static void gather_candidates(struct list_head *set, + struct list_head *candidates) +{ + struct mount *m, *p, *q; - /* If a mount is without children and not locked umount it. */ - if (!IS_MNT_LOCKED(mnt)) { - umount_one(mnt, to_umount); - } else { -children: - list_move_tail(&mnt->mnt_umounting, to_restore); + list_for_each_entry(m, set, mnt_list) { + if (is_candidate(m)) + continue; + m->mnt_t_flags |= T_UMOUNT_CANDIDATE; + p = m->mnt_parent; + q = propagation_next(p, p); + while (q) { + struct mount *child = __lookup_mnt(&q->mnt, + m->mnt_mountpoint); + if (child) { + /* + * We might've already run into this one. That + * must've happened on earlier iteration of the + * outer loop; in that case we can skip those + * parents that get propagation from q - there + * will be nothing new on those as well. + */ + if (is_candidate(child)) { + q = skip_propagation_subtree(q, p); + continue; + } + child->mnt_t_flags |= T_UMOUNT_CANDIDATE; + if (!will_be_unmounted(child)) + list_add(&child->mnt_list, candidates); + } + q = propagation_next(q, p); + } } -out: - return progress; + list_for_each_entry(m, set, mnt_list) + m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } -static void umount_list(struct list_head *to_umount, - struct list_head *to_restore) +/* + * We know that some child of @m can't be unmounted. In all places where the + * chain of descent of @m has child not overmounting the root of parent, + * the parent can't be unmounted either. + */ +static void trim_ancestors(struct mount *m) { - struct mount *mnt, *child, *tmp; - list_for_each_entry(mnt, to_umount, mnt_list) { - list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { - /* topper? */ - if (child->mnt_mountpoint == mnt->mnt.mnt_root) - list_move_tail(&child->mnt_umounting, to_restore); - else - umount_one(child, to_umount); - } + struct mount *p; + + for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) { + if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts + return; + SET_MNT_MARK(m); + if (m != p->overmount) + p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } } -static void restore_mounts(struct list_head *to_restore) +/* + * Find and exclude all umount candidates forbidden by @m + * (see Documentation/filesystems/propagate_umount.txt) + * If we can immediately tell that @m is OK to unmount (unlocked + * and all children are already committed to unmounting) commit + * to unmounting it. + * Only @m itself might be taken from the candidates list; + * anything found by trim_ancestors() is marked non-candidate + * and left on the list. + */ +static void trim_one(struct mount *m, struct list_head *to_umount) { - /* Restore mounts to a clean working state */ - while (!list_empty(to_restore)) { - struct mount *mnt, *parent; - struct mountpoint *mp; - - mnt = list_first_entry(to_restore, struct mount, mnt_umounting); - CLEAR_MNT_MARK(mnt); - list_del_init(&mnt->mnt_umounting); - - /* Should this mount be reparented? */ - mp = mnt->mnt_mp; - parent = mnt->mnt_parent; - while (parent->mnt.mnt_flags & MNT_UMOUNT) { - mp = parent->mnt_mp; - parent = parent->mnt_parent; - } - if (parent != mnt->mnt_parent) { - mnt_change_mountpoint(parent, mp, mnt); - mnt_notify_add(mnt); + bool remove_this = false, found = false, umount_this = false; + struct mount *n; + + if (!is_candidate(m)) { // trim_ancestors() left it on list + remove_from_candidate_list(m); + return; + } + + list_for_each_entry(n, &m->mnt_mounts, mnt_child) { + if (!is_candidate(n)) { + found = true; + if (n != m->overmount) { + remove_this = true; + break; + } } } + if (found) { + trim_ancestors(m); + } else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) { + remove_this = true; + umount_this = true; + } + if (remove_this) { + remove_from_candidate_list(m); + if (umount_this) + umount_one(m, to_umount); + } } -static void cleanup_umount_visitations(struct list_head *visited) +static void handle_locked(struct mount *m, struct list_head *to_umount) { - while (!list_empty(visited)) { - struct mount *mnt = - list_first_entry(visited, struct mount, mnt_umounting); - list_del_init(&mnt->mnt_umounting); + struct mount *cutoff = m, *p; + + if (!is_candidate(m)) { // trim_ancestors() left it on list + remove_from_candidate_list(m); + return; + } + for (p = m; is_candidate(p); p = p->mnt_parent) { + remove_from_candidate_list(p); + if (!IS_MNT_LOCKED(p)) + cutoff = p->mnt_parent; + } + if (will_be_unmounted(p)) + cutoff = p; + while (m != cutoff) { + umount_one(m, to_umount); + m = m->mnt_parent; } } /* - * collect all mounts that receive propagation from the mount in @list, - * and return these additional mounts in the same list. - * @list: the list of mounts to be unmounted. + * @m is not to going away, and it overmounts the top of a stack of mounts + * that are going away. We know that all of those are fully overmounted + * by the one above (@m being the topmost of the chain), so @m can be slid + * in place where the bottom of the stack is attached. * - * vfsmount lock must be held for write + * NOTE: here we temporarily violate a constraint - two mounts end up with + * the same parent and mountpoint; that will be remedied as soon as we + * return from propagate_umount() - its caller (umount_tree()) will detach + * the stack from the parent it (and now @m) is attached to. umount_tree() + * might choose to keep unmounted pieces stuck to each other, but it always + * detaches them from the mounts that remain in the tree. */ -int propagate_umount(struct list_head *list) +static void reparent(struct mount *m) { - struct mount *mnt; - LIST_HEAD(to_restore); - LIST_HEAD(to_umount); - LIST_HEAD(visited); - - /* Find candidates for unmounting */ - list_for_each_entry_reverse(mnt, list, mnt_list) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; + struct mount *p = m; + struct mountpoint *mp; - /* - * If this mount has already been visited it is known that it's - * entire peer group and all of their slaves in the propagation - * tree for the mountpoint has already been visited and there is - * no need to visit them again. - */ - if (!list_empty(&mnt->mnt_umounting)) - continue; + do { + mp = p->mnt_mp; + p = p->mnt_parent; + } while (will_be_unmounted(p)); - list_add_tail(&mnt->mnt_umounting, &visited); - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt(&m->mnt, - mnt->mnt_mountpoint); - if (!child) - continue; + mnt_change_mountpoint(p, mp, m); + mnt_notify_add(m); +} - if (!list_empty(&child->mnt_umounting)) { - /* - * If the child has already been visited it is - * know that it's entire peer group and all of - * their slaves in the propgation tree for the - * mountpoint has already been visited and there - * is no need to visit this subtree again. - */ - m = skip_propagation_subtree(m, parent); - continue; - } else if (child->mnt.mnt_flags & MNT_UMOUNT) { - /* - * We have come across a partially unmounted - * mount in a list that has not been visited - * yet. Remember it has been visited and - * continue about our merry way. - */ - list_add_tail(&child->mnt_umounting, &visited); - continue; - } +/** + * propagate_umount - apply propagation rules to the set of mounts for umount() + * @set: the list of mounts to be unmounted. + * + * Collect all mounts that receive propagation from the mount in @set and have + * no obstacles to being unmounted. Add these additional mounts to the set. + * + * See Documentation/filesystems/propagate_umount.txt if you do anything in + * this area. + * + * Locks held: + * mount_lock (write_seqlock), namespace_sem (exclusive). + */ +void propagate_umount(struct list_head *set) +{ + struct mount *m, *p; + LIST_HEAD(to_umount); // committed to unmounting + LIST_HEAD(candidates); // undecided umount candidates - /* Check the child and parents while progress is made */ - while (__propagate_umount(child, - &to_umount, &to_restore)) { - /* Is the parent a umount candidate? */ - child = child->mnt_parent; - if (list_empty(&child->mnt_umounting)) - break; - } - } + // collect all candidates + gather_candidates(set, &candidates); + + // reduce the set until it's non-shifting + list_for_each_entry_safe(m, p, &candidates, mnt_list) + trim_one(m, &to_umount); + + // ... and non-revealing + while (!list_empty(&candidates)) { + m = list_first_entry(&candidates,struct mount, mnt_list); + handle_locked(m, &to_umount); } - umount_list(&to_umount, &to_restore); - restore_mounts(&to_restore); - cleanup_umount_visitations(&visited); - list_splice_tail(&to_umount, list); + // now to_umount consists of all acceptable candidates + // deal with reparenting of remaining overmounts on those + list_for_each_entry(m, &to_umount, mnt_list) { + if (m->overmount) + reparent(m->overmount); + } - return 0; + // and fold them into the set + list_splice_tail_init(&to_umount, set); } diff --git a/fs/pnode.h b/fs/pnode.h index 34b6247af01d..00ab153e3e9d 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -10,14 +10,14 @@ #include <linux/list.h> #include "mount.h" -#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) +#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) #define IS_MNT_NEW(m) (!(m)->mnt_ns) -#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) -#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) -#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) -#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) -#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) +#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED) +#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED) #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) #define CL_EXPIRE 0x01 @@ -25,21 +25,26 @@ #define CL_COPY_UNBINDABLE 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 -#define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 -#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) - +/* + * EXCL[namespace_sem] + */ static inline void set_mnt_shared(struct mount *mnt) { - mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; - mnt->mnt.mnt_flags |= MNT_SHARED; + mnt->mnt_t_flags &= ~T_SHARED_MASK; + mnt->mnt_t_flags |= T_SHARED; +} + +static inline bool peers(const struct mount *m1, const struct mount *m2) +{ + return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); -int propagate_umount(struct list_head *); +void propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); diff --git a/fs/proc/base.c b/fs/proc/base.c index fe33a5843fbd..62d35631ba8c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -827,7 +827,13 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; - +/* + * proc_mem_open() can return errno, NULL or mm_struct*. + * + * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) + * - Returns mm_struct* on success + * - Returns error code on failure + */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); @@ -854,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; @@ -2698,8 +2704,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry, inode->i_fop = p->fop; ei->op = p->op; pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -3285,7 +3290,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, } #endif /* CONFIG_KSM */ -#ifdef CONFIG_STACKLEAK_METRICS +#ifdef CONFIG_KSTACK_ERASE_METRICS static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3298,7 +3303,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns, prev_depth, depth); return 0; } -#endif /* CONFIG_STACKLEAK_METRICS */ +#endif /* CONFIG_KSTACK_ERASE_METRICS */ /* * Thread groups @@ -3405,7 +3410,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif -#ifdef CONFIG_STACKLEAK_METRICS +#ifdef CONFIG_KSTACK_ERASE_METRICS ONE("stack_depth", S_IRUGO, proc_stack_depth), #endif #ifdef CONFIG_PROC_PID_ARCH_STATUS @@ -3495,8 +3500,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, set_nlink(inode, nlink_tgid); pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) @@ -3798,8 +3802,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry, set_nlink(inode, nlink_tid); pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 37aa778d1af7..9eeccff49b2a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -352,18 +352,9 @@ static int proc_fd_getattr(struct mnt_idmap *idmap, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int rv = 0; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - - /* If it's a directory, put the number of open fds there */ - if (S_ISDIR(inode->i_mode)) { - rv = proc_readfd_count(inode, &stat->size); - if (rv < 0) - return rv; - } - - return rv; + return proc_readfd_count(inode, &stat->size); } const struct inode_operations proc_fd_inode_operations = { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index a3e22803cddf..76e800e38c8f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -254,8 +254,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, de->proc_dops); - return d_splice_alias(inode, dentry); + if (de->flags & PROC_ENTRY_FORCE_LOOKUP) + return d_splice_alias_ops(inode, dentry, + &proc_net_dentry_ops); + return d_splice_alias_ops(inode, dentry, + &proc_misc_dentry_ops); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -448,9 +451,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); - ent->proc_dops = &proc_misc_dentry_ops; /* Revalidate everything under /proc/${pid}/net */ - if ((*parent)->proc_dops == &proc_net_dentry_ops) + if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP) pde_force_lookup(ent); out: @@ -569,6 +571,8 @@ static void pde_set_flags(struct proc_dir_entry *pde) if (pde->proc_ops->proc_compat_ioctl) pde->flags |= PROC_ENTRY_proc_compat_ioctl; #endif + if (pde->proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a3eb3b740f76..129490151be1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -42,7 +42,7 @@ static void proc_evict_inode(struct inode *inode) head = ei->sysctl; if (head) { - RCU_INIT_POINTER(ei->sysctl, NULL); + WRITE_ONCE(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } @@ -473,7 +473,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) typeof_member(struct proc_ops, proc_open) open; struct pde_opener *pdeo; - if (!pde->proc_ops->proc_lseek) + if (!pde_has_proc_lseek(pde)) file->f_mode &= ~FMODE_LSEEK; if (pde_is_permanent(pde)) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 96122e91c645..e737401d7383 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,7 +44,6 @@ struct proc_dir_entry { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; - const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); @@ -99,6 +98,11 @@ static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) #endif } +static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_lseek; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -379,6 +383,11 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; + loff_t last_pos; +#ifdef CONFIG_PER_VMA_LOCK + bool mmap_locked; + struct vm_area_struct *locked_vma; +#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif @@ -403,7 +412,7 @@ extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ - pde->proc_dops = &proc_net_dentry_ops; + pde->flags |= PROC_ENTRY_FORCE_LOOKUP; } /* @@ -414,7 +423,6 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde) static inline struct dentry *proc_splice_unmountable(struct inode *inode, struct dentry *dentry, const struct dentry_operations *d_ops) { - d_set_d_op(dentry, d_ops); dont_mount(dentry); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, d_ops); } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index bc2bc60c36cc..a458f1e112fd 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -121,8 +121,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", 0); - show_val_kb(m, "WritebackTmp: ", - global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "WritebackTmp: ", 0); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index c610224faf10..4403a2e20c16 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -111,8 +111,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, ei->ns_ops = ns_ops; pid_update_inode(task, inode); - d_set_d_op(dentry, &pid_dentry_operations); - return d_splice_alias(inode, dentry); + return d_splice_alias_ops(inode, dentry, &pid_dentry_operations); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/page.c b/fs/proc/page.c index 23fc771100ae..ba3568e97fd1 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -22,6 +22,12 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +enum kpage_operation { + KPAGE_FLAGS, + KPAGE_COUNT, + KPAGE_CGROUP, +}; + static inline unsigned long get_max_dump_pfn(void) { #ifdef CONFIG_SPARSEMEM @@ -37,19 +43,33 @@ static inline unsigned long get_max_dump_pfn(void) #endif } -/* /proc/kpagecount - an array exposing page mapcounts - * - * Each entry is a u64 representing the corresponding - * physical page mapcount. - */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static u64 get_kpage_count(const struct page *page) +{ + struct page_snapshot ps; + u64 ret; + + snapshot_page(&ps, page); + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + ret = folio_precise_page_mapcount(&ps.folio_snapshot, + &ps.page_snapshot); + else + ret = folio_average_page_mapcount(&ps.folio_snapshot); + + return ret; +} + +static ssize_t kpage_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + enum kpage_operation op) { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; + struct page *page; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; + u64 info; pfn = src / KPMSIZE; if (src & KPMMASK || count & KPMMASK) @@ -59,24 +79,31 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { - struct page *page; - u64 mapcount = 0; - /* * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ page = pfn_to_online_page(pfn); - if (page) { - struct folio *folio = page_folio(page); - if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) - mapcount = folio_precise_page_mapcount(folio, page); - else - mapcount = folio_average_page_mapcount(folio); - } - - if (put_user(mapcount, out)) { + if (page) { + switch (op) { + case KPAGE_FLAGS: + info = stable_page_flags(page); + break; + case KPAGE_COUNT: + info = get_kpage_count(page); + break; + case KPAGE_CGROUP: + info = page_cgroup_ino(page); + break; + default: + info = 0; + break; + } + } else + info = 0; + + if (put_user(info, out)) { ret = -EFAULT; break; } @@ -94,17 +121,23 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } +/* /proc/kpagecount - an array exposing page mapcounts + * + * Each entry is a u64 representing the corresponding + * physical page mapcount. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return kpage_read(file, buf, count, ppos, KPAGE_COUNT); +} + static const struct proc_ops kpagecount_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; -/* /proc/kpageflags - an array exposing page flags - * - * Each entry is a u64 representing the corresponding - * physical page flags. - */ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { @@ -114,6 +147,7 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) u64 stable_page_flags(const struct page *page) { const struct folio *folio; + struct page_snapshot ps; unsigned long k; unsigned long mapping; bool is_anon; @@ -125,20 +159,22 @@ u64 stable_page_flags(const struct page *page) */ if (!page) return 1 << KPF_NOPAGE; - folio = page_folio(page); + + snapshot_page(&ps, page); + folio = &ps.folio_snapshot; k = folio->flags; mapping = (unsigned long)folio->mapping; - is_anon = mapping & PAGE_MAPPING_ANON; + is_anon = mapping & FOLIO_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages */ - if (page_mapped(page)) + if (folio_mapped(folio)) u |= 1 << KPF_MMAP; if (is_anon) { u |= 1 << KPF_ANON; - if (mapping & PAGE_MAPPING_KSM) + if (mapping & FOLIO_MAPPING_KSM) u |= 1 << KPF_KSM; } @@ -146,7 +182,7 @@ u64 stable_page_flags(const struct page *page) * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ - if (page == &folio->page) + if (ps.idx == 0) u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head); else u |= 1 << KPF_COMPOUND_TAIL; @@ -156,25 +192,19 @@ u64 stable_page_flags(const struct page *page) folio_test_large_rmappable(folio)) { /* Note: we indicate any THPs here, not just PMD-sized ones */ u |= 1 << KPF_THP; - } else if (is_huge_zero_folio(folio)) { + } else if (is_huge_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; u |= 1 << KPF_THP; - } else if (is_zero_folio(folio)) { + } else if (is_zero_pfn(ps.pfn)) { u |= 1 << KPF_ZERO_PAGE; } - /* - * Caveats on high order pages: PG_buddy and PG_slab will only be set - * on the head page. - */ - if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; - else if (page_count(page) == 0 && is_free_buddy_page(page)) + if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY) u |= 1 << KPF_BUDDY; - if (PageOffline(page)) + if (folio_test_offline(folio)) u |= 1 << KPF_OFFLINE; - if (PageTable(page)) + if (folio_test_pgtable(folio)) u |= 1 << KPF_PGTABLE; if (folio_test_slab(folio)) u |= 1 << KPF_SLAB; @@ -182,7 +212,7 @@ u64 stable_page_flags(const struct page *page) #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) u |= kpf_copy_bit(k, KPF_IDLE, PG_idle); #else - if (folio_test_idle(folio)) + if (ps.flags & PAGE_SNAPSHOT_PG_IDLE) u |= 1 << KPF_IDLE; #endif @@ -208,7 +238,7 @@ u64 stable_page_flags(const struct page *page) if (u & (1 << KPF_HUGE)) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); else - u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); + u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); @@ -225,47 +255,17 @@ u64 stable_page_flags(const struct page *page) #endif return u; -}; +} +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - struct page *page = pfn_to_online_page(pfn); - - if (put_user(stable_page_flags(page), out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_FLAGS); } static const struct proc_ops kpageflags_proc_ops = { @@ -276,53 +276,10 @@ static const struct proc_ops kpageflags_proc_ops = { #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 ino; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - ppage = pfn_to_online_page(pfn); - - if (ppage) - ino = page_cgroup_ino(ppage); - else - ino = 0; - - if (put_user(ino, out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_CGROUP); } - static const struct proc_ops kpagecgroup_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index cc9d74a06ff0..49ab74e0bfde 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -540,9 +540,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - d_set_d_op(dentry, &proc_sys_dentry_operations); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - err = d_splice_alias(inode, dentry); + err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations); out: if (h) @@ -699,9 +698,9 @@ static bool proc_sys_fill_cache(struct file *file, return false; if (d_in_lookup(child)) { struct dentry *res; - d_set_d_op(child, &proc_sys_dentry_operations); inode = proc_sys_make_inode(dir->d_sb, head, table); - res = d_splice_alias(inode, child); + res = d_splice_alias_ops(inode, child, + &proc_sys_dentry_operations); d_lookup_done(child); if (unlikely(res)) { dput(child); @@ -918,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry, struct ctl_table_header *head; struct inode *inode; - /* Although proc doesn't have negative dentries, rcu-walk means - * that inode here can be NULL */ - /* AV: can it, indeed? */ - inode = d_inode_rcu(dentry); - if (!inode) - return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - head = rcu_dereference(PROC_I(inode)->sysctl); + + // false positive is fine here - we'll recheck anyway + if (d_in_lookup(dentry)) + return 0; + + inode = d_inode_rcu(dentry); + // we just might have run into dentry in the middle of __dentry_kill() + if (!inode) + return 1; + + head = READ_ONCE(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 06a297a27ba3..ed86ac710384 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -363,12 +363,12 @@ static const struct inode_operations proc_root_inode_operations = { * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { - .low_ino = PROC_ROOT_INO, - .namelen = 5, - .mode = S_IFDIR | S_IRUGO | S_IXUGO, - .nlink = 2, + .low_ino = PROCFS_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, .refcnt = REFCOUNT_INIT(1), - .proc_iops = &proc_root_inode_operations, + .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994cde10e3f4..3d6d8a9f13fc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -29,6 +29,9 @@ #include <asm/tlbflush.h> #include "internal.h" +#define SENTINEL_VMA_END -1 +#define SENTINEL_VMA_GATE -2 + #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) @@ -36,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; - anon = get_mm_counter(mm, MM_ANONPAGES); - file = get_mm_counter(mm, MM_FILEPAGES); - shmem = get_mm_counter(mm, MM_SHMEMPAGES); + anon = get_mm_counter_sum(mm, MM_ANONPAGES); + file = get_mm_counter_sum(mm, MM_FILEPAGES); + shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and @@ -59,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; - swap = get_mm_counter(mm, MM_SWAPENTS); + swap = get_mm_counter_sum(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); @@ -92,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + *shared = get_mm_counter_sum(mm, MM_FILEPAGES) + + get_mm_counter_sum(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; - *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES); return mm->total_vm; } @@ -127,15 +130,134 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, - loff_t *ppos) +#ifdef CONFIG_PER_VMA_LOCK + +static void unlock_vma(struct proc_maps_private *priv) +{ + if (priv->locked_vma) { + vma_end_read(priv->locked_vma); + priv->locked_vma = NULL; + } +} + +static const struct seq_operations proc_pid_maps_op; + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_private *priv) +{ + /* + * smaps and numa_maps perform page table walk, therefore require + * mmap_lock but maps can be read with locking just the vma and + * walking the vma tree under rcu read protection. + */ + if (m->op != &proc_pid_maps_op) { + if (mmap_read_lock_killable(priv->mm)) + return false; + + priv->mmap_locked = true; + } else { + rcu_read_lock(); + priv->locked_vma = NULL; + priv->mmap_locked = false; + } + + return true; +} + +static inline void unlock_vma_range(struct proc_maps_private *priv) { - struct vm_area_struct *vma = vma_next(&priv->iter); + if (priv->mmap_locked) { + mmap_read_unlock(priv->mm); + } else { + unlock_vma(priv); + rcu_read_unlock(); + } +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + struct vm_area_struct *vma; + + if (priv->mmap_locked) + return vma_next(&priv->iter); + + unlock_vma(priv); + vma = lock_next_vma(priv->mm, &priv->iter, last_pos); + if (!IS_ERR_OR_NULL(vma)) + priv->locked_vma = vma; + + return vma; +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + if (priv->mmap_locked) + return false; + + rcu_read_unlock(); + mmap_read_lock(priv->mm); + /* Reinitialize the iterator after taking mmap_lock */ + vma_iter_set(&priv->iter, pos); + priv->mmap_locked = true; + + return true; +} +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool lock_vma_range(struct seq_file *m, + struct proc_maps_private *priv) +{ + return mmap_read_lock_killable(priv->mm) == 0; +} + +static inline void unlock_vma_range(struct proc_maps_private *priv) +{ + mmap_read_unlock(priv->mm); +} + +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, + loff_t last_pos) +{ + return vma_next(&priv->iter); +} + +static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, + loff_t pos) +{ + return false; +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma; + +retry: + vma = get_next_vma(priv, *ppos); + /* EINTR of EAGAIN is possible */ + if (IS_ERR(vma)) { + if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos)) + goto retry; + + return vma; + } + + /* Store previous position to be able to restart if needed */ + priv->last_pos = *ppos; if (vma) { - *ppos = vma->vm_start; + /* + * Track the end of the reported vma to ensure position changes + * even if previous vma was merged with the next vma and we + * found the extended vma with the same vm_start. + */ + *ppos = vma->vm_end; } else { - *ppos = -2UL; + *ppos = SENTINEL_VMA_GATE; vma = get_gate_vma(priv->mm); } @@ -145,11 +267,11 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = *ppos; + loff_t last_addr = *ppos; struct mm_struct *mm; /* See m_next(). Zero at the start or after lseek. */ - if (last_addr == -1UL) + if (last_addr == SENTINEL_VMA_END) return NULL; priv->task = get_proc_task(priv->inode); @@ -163,28 +285,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return NULL; } - if (mmap_read_lock_killable(mm)) { + if (!lock_vma_range(m, priv)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; return ERR_PTR(-EINTR); } - vma_iter_init(&priv->iter, mm, last_addr); + /* + * Reset current position if last_addr was set before + * and it's not a sentinel. + */ + if (last_addr > 0) + *ppos = last_addr = priv->last_pos; + vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); hold_task_mempolicy(priv); - if (last_addr == -2UL) + if (last_addr == SENTINEL_VMA_GATE) return get_gate_vma(mm); - return proc_get_vma(priv, ppos); + return proc_get_vma(m, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - if (*ppos == -2UL) { - *ppos = -1UL; + if (*ppos == SENTINEL_VMA_GATE) { + *ppos = SENTINEL_VMA_END; return NULL; } - return proc_get_vma(m->private, ppos); + return proc_get_vma(m, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -196,7 +324,7 @@ static void m_stop(struct seq_file *m, void *v) return; release_task_mempolicy(priv); - mmap_read_unlock(mm); + unlock_vma_range(priv); mmput(mm); put_task_struct(priv->task); priv->task = NULL; @@ -212,8 +340,8 @@ static int proc_maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -1325,8 +1453,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file) priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - ret = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; single_release(inode, file); goto out_free; @@ -2069,8 +2197,8 @@ static int pagemap_open(struct inode *inode, struct file *file) struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } @@ -2087,7 +2215,8 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -2128,12 +2257,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pte_to_swp_entry(pte); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) - categories |= PAGE_IS_FILE; - } + swp = pte_to_swp_entry(pte); + if (is_guard_swp_entry(swp)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + is_pfn_swap_entry(swp) && + !folio_test_anon(pfn_swap_entry_folio(swp))) + categories |= PAGE_IS_FILE; + if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } @@ -2179,7 +2310,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_FILE; } - if (is_zero_pfn(pmd_pfn(pmd))) + if (is_huge_zero_pmd(pmd)) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index bce674533000..59bfd61d653a 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -260,8 +260,8 @@ static int maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index bb3b769edc71..1a2e1185426c 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -300,7 +300,7 @@ static struct dentry *psinfo_lock_root(void) return NULL; root = pstore_sb->s_root; - inode_lock(d_inode(root)); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); return root; } @@ -318,8 +318,7 @@ int pstore_put_backend_records(struct pstore_info *psi) list_for_each_entry_safe(pos, tmp, &records_list, list) { if (pos->record->psi == psi) { list_del_init(&pos->list); - d_invalidate(pos->dentry); - simple_unlink(d_inode(root), pos->dentry); + locked_recursive_removal(pos->dentry, NULL); pos->dentry = NULL; } } diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b45c7edc3225..b11f5b20b78b 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -41,7 +41,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .fsync = noop_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 7a6d980e614d..77b8ca2757e0 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -28,7 +28,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long len, unsigned long pgoff, unsigned long flags); -static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); +static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc); static unsigned ramfs_mmap_capabilities(struct file *file) { @@ -38,7 +38,7 @@ static unsigned ramfs_mmap_capabilities(struct file *file) const struct file_operations ramfs_file_operations = { .mmap_capabilities = ramfs_mmap_capabilities, - .mmap = ramfs_nommu_mmap, + .mmap_prepare = ramfs_nommu_mmap_prepare, .get_unmapped_area = ramfs_nommu_get_unmapped_area, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -262,12 +262,12 @@ out: /* * set up a mapping for shared memory segments */ -static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc) { - if (!is_nommu_shared_mapping(vma->vm_flags)) + if (!is_nommu_shared_mapping(desc->vm_flags)) return -ENOSYS; - file_accessed(file); - vma->vm_ops = &generic_file_vm_ops; + file_accessed(desc->file); + desc->vm_ops = &generic_file_vm_ops; return 0; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 775fa905fda0..f8874c3b8c1e 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -269,6 +269,7 @@ static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RAMFS_MAGIC; sb->s_op = &ramfs_ops; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); diff --git a/fs/read_write.c b/fs/read_write.c index 0ef70e128c4a..c5b6265d984b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -28,7 +28,7 @@ const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, - .mmap = generic_file_readonly_mmap, + .mmap_prepare = generic_file_readonly_mmap_prepare, .splice_read = filemap_splice_read, }; @@ -237,7 +237,7 @@ EXPORT_SYMBOL(generic_llseek_cookie); * @offset: file offset to seek to * @whence: type of seek * - * This is a generic implemenation of ->llseek useable for all normal local + * This is a generic implementation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 6ed2dfd4dbbd..d98e0d2de09f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -594,9 +594,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct rmid_read rr = {0}; struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; + int domid, cpu, ret = 0; struct rdt_resource *r; + struct cacheinfo *ci; struct mon_data *md; - int domid, ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -623,10 +624,14 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * one that matches this cache id. */ list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; + if (d->ci_id == domid) { + rr.ci_id = d->ci_id; + cpu = cpumask_any(&d->hdr.cpu_mask); + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) + continue; mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evtid, false); goto checkresult; } } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..0a1eedba2b03 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - struct cacheinfo *ci; + unsigned int ci_id; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index bde2801289d3..f5637855c3ac 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,6 +361,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; + struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -388,7 +389,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci || ci->id != rr->ci_id) return -EINVAL; /* @@ -400,7 +402,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) + if (d->ci_id != rr->ci_id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c index ccc2f9213b4b..87bbc2605de1 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -764,13 +764,9 @@ static ssize_t pseudo_lock_measure_trigger(struct file *file, if (ret == 0) { if (sel != 1 && sel != 2 && sel != 3) return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; ret = pseudo_lock_measure_cycles(rdtgrp, sel); if (ret == 0) ret = count; - debugfs_file_put(file->f_path.dentry); } return ret; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index cc37f58b47dd..77d08229d855 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -536,6 +536,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } + rdt_last_cmd_clear(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { ret = -EINVAL; @@ -3034,7 +3036,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, char name[32]; snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); if (snc_mode) sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); @@ -3059,7 +3061,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return -EPERM; list_for_each_entry(mevt, &r->evt_list, list) { - domid = do_sum ? d->ci->id : d->hdr.id; + domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) return -EINVAL; @@ -3087,7 +3089,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, lockdep_assert_held(&rdtgroup_mutex); snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* @@ -3472,6 +3474,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_unlock; } + rdt_last_cmd_clear(); + /* * Check that the parent directory for a monitor group is a "mon_groups" * directory. diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 4520ca413867..4b77c6dc4418 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -61,9 +61,9 @@ static unsigned long romfs_get_unmapped_area(struct file *file, * permit a R/O mapping to be made directly through onto an MTD device if * possible */ -static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +static int romfs_mmap_prepare(struct vm_area_desc *desc) { - return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(desc->vm_flags) ? 0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) @@ -79,7 +79,7 @@ const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .splice_read = filemap_splice_read, - .mmap = romfs_mmap, + .mmap_prepare = romfs_mmap_prepare, .get_unmapped_area = romfs_get_unmapped_area, .mmap_capabilities = romfs_mmap_capabilities, }; diff --git a/fs/select.c b/fs/select.c index 9fb650d03d52..082cf60c7e23 100644 --- a/fs/select.c +++ b/fs/select.c @@ -192,7 +192,7 @@ static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); - pwq->triggered = 1; + WRITE_ONCE(pwq->triggered, 1); /* * Perform the default wake up operation using a dummy @@ -237,7 +237,7 @@ static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, int rc = -EINTR; set_current_state(state); - if (!pwq->triggered) + if (!READ_ONCE(pwq->triggered)) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 89d2dbbb742c..b69daeb1301b 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -155,6 +155,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fids *cfids; const char *npath; int retries = 0, cur_sleep = 1; + __le32 lease_flags = 0; if (cifs_sb->root == NULL) return -ENOENT; @@ -194,6 +195,7 @@ replay_again: * from @cfids->entries. Caller will put last reference if the latter. */ if (cfid->has_lease && cfid->time) { + cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; kfree(utf16_path); @@ -201,6 +203,8 @@ replay_again: } spin_unlock(&cfids->cfid_list_lock); + pfid = &cfid->fid; + /* * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up * calling ->lookup() which already adds those through @@ -222,6 +226,25 @@ replay_again: rc = -ENOENT; goto out; } + if (dentry->d_parent && server->dialect >= SMB30_PROT_ID) { + struct cached_fid *parent_cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(parent_cfid, &cfids->entries, entry) { + if (parent_cfid->dentry == dentry->d_parent) { + cifs_dbg(FYI, "found a parent cached file handle\n"); + if (parent_cfid->has_lease && parent_cfid->time) { + lease_flags + |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; + memcpy(pfid->parent_lease_key, + parent_cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE); + } + break; + } + } + spin_unlock(&cfids->cfid_list_lock); + } } cfid->dentry = dentry; cfid->tcon = tcon; @@ -236,7 +259,6 @@ replay_again: if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); memset(rqst, 0, sizeof(rqst)); @@ -256,6 +278,7 @@ replay_again: FILE_READ_EA, .disposition = FILE_OPEN, .fid = pfid, + .lease_flags = lease_flags, .replay = !!(retries), }; @@ -341,6 +364,7 @@ replay_again: cfid->file_all_info_is_valid = true; cfid->time = jiffies; + cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); /* At this point the directory handle is fully cached */ rc = 0; @@ -487,8 +511,17 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { tmp_list = kmalloc(sizeof(*tmp_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; + if (tmp_list == NULL) { + /* + * If the malloc() fails, we won't drop all + * dentries, and unmounting is likely to trigger + * a 'Dentry still in use' error. + */ + cifs_tcon_dbg(VFS, "Out of memory while dropping dentries\n"); + spin_unlock(&cfids->cfid_list_lock); + spin_unlock(&cifs_sb->tlink_tree_lock); + goto done; + } spin_lock(&cfid->fid_lock); tmp_list->dentry = cfid->dentry; cfid->dentry = NULL; @@ -500,6 +533,7 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); +done: list_for_each_entry_safe(tmp_list, q, &entry, entry) { list_del(&tmp_list->entry); dput(tmp_list->dentry); @@ -585,7 +619,7 @@ static void cached_dir_put_work(struct work_struct *work) queue_work(serverclose_wq, &cfid->close_work); } -int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) +bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]) { struct cached_fids *cfids = tcon->cfids; struct cached_fid *cfid; @@ -698,8 +732,8 @@ static void cfids_laundromat_worker(struct work_struct *work) spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - if (cfid->time && - time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { + if (cfid->last_access_time && + time_after(jiffies, cfid->last_access_time + HZ * dir_cache_timeout)) { cfid->on_list = false; list_move(&cfid->entry, &entry); cfids->num_entries--; diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index 1dfe79d947a6..46b5a2fdf15b 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -14,19 +14,18 @@ struct cached_dirent { char *name; int namelen; loff_t pos; - struct cifs_fattr fattr; }; struct cached_dirents { bool is_valid:1; bool is_failed:1; - struct dir_context *ctx; /* - * Only used to make sure we only take entries - * from a single context. Never dereferenced. - */ + struct file *file; /* + * Used to associate the cache with a single + * open file instance. + */ struct mutex de_mutex; - int pos; /* Expected ctx->pos */ + loff_t pos; /* Expected ctx->pos */ struct list_head entries; }; @@ -39,6 +38,7 @@ struct cached_fid { bool on_list:1; bool file_all_info_is_valid:1; unsigned long time; /* jiffies of when lease was taken */ + unsigned long last_access_time; /* jiffies of when last accessed */ struct kref refcount; struct cifs_fid fid; spinlock_t fid_lock; @@ -80,6 +80,6 @@ extern void drop_cached_dir_by_name(const unsigned int xid, struct cifs_sb_info *cifs_sb); extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb); extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon); -extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); +extern bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]); #endif /* _CACHED_DIR_H */ diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index e03c890de0a0..f1cea365b6f1 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -26,6 +26,7 @@ #include "smbdirect.h" #endif #include "cifs_swn.h" +#include "cached_dir.h" void cifs_dump_mem(char *label, void *data, int length) @@ -280,6 +281,54 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) return 0; } +static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) +{ + struct list_head *stmp, *tmp, *tmp1; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct cached_fids *cfids; + struct cached_fid *cfid; + LIST_HEAD(entry); + + seq_puts(m, "# Version:1\n"); + seq_puts(m, "# Format:\n"); + seq_puts(m, "# <tree id> <sess id> <persistent fid> <path>\n"); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(stmp, &cifs_tcp_ses_list) { + server = list_entry(stmp, struct TCP_Server_Info, + tcp_ses_list); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + cfids = tcon->cfids; + spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ + seq_printf(m, "Num entries: %d\n", cfids->num_entries); + list_for_each_entry(cfid, &cfids->entries, entry) { + seq_printf(m, "0x%x 0x%llx 0x%llx %s", + tcon->tid, + ses->Suid, + cfid->fid.persistent_fid, + cfid->path); + if (cfid->file_all_info_is_valid) + seq_printf(m, "\tvalid file info"); + if (cfid->dirents.is_valid) + seq_printf(m, ", valid dirents"); + seq_printf(m, "\n"); + } + spin_unlock(&cfids->cfid_list_lock); + + + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + seq_putc(m, '\n'); + return 0; +} + static __always_inline const char *compression_alg_str(__le16 alg) { switch (alg) { @@ -362,6 +411,10 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) c = 0; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { +#ifdef CONFIG_CIFS_SMB_DIRECT + struct smbdirect_socket_parameters *sp; +#endif + /* channel info will be printed as a part of sessions below */ if (SERVER_IS_CHAN(server)) continue; @@ -383,25 +436,26 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, "\nSMBDirect transport not available"); goto skip_rdma; } + sp = &server->smbd_conn->socket.parameters; seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " "transport status: %x", server->smbd_conn->protocol, - server->smbd_conn->transport_status); + server->smbd_conn->socket.status); seq_printf(m, "\nConn receive_credit_max: %x " "send_credit_target: %x max_send_size: %x", - server->smbd_conn->receive_credit_max, - server->smbd_conn->send_credit_target, - server->smbd_conn->max_send_size); + sp->recv_credit_max, + sp->send_credit_target, + sp->max_send_size); seq_printf(m, "\nConn max_fragmented_recv_size: %x " "max_fragmented_send_size: %x max_receive_size:%x", - server->smbd_conn->max_fragmented_recv_size, - server->smbd_conn->max_fragmented_send_size, - server->smbd_conn->max_receive_size); + sp->max_fragmented_recv_size, + sp->max_fragmented_send_size, + sp->max_recv_size); seq_printf(m, "\nConn keep_alive_interval: %x " "max_readwrite_size: %x rdma_readwrite_threshold: %x", - server->smbd_conn->keep_alive_interval, - server->smbd_conn->max_readwrite_size, + sp->keepalive_interval_msec * 1000, + sp->max_read_write_size, server->smbd_conn->rdma_readwrite_threshold); seq_printf(m, "\nDebug count_get_receive_buffer: %x " "count_put_receive_buffer: %x count_send_empty: %x", @@ -858,6 +912,9 @@ cifs_proc_init(void) proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); + proc_create_single("open_dirs", 0400, proc_fs_cifs, + cifs_debug_dirs_proc_show); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); @@ -902,6 +959,7 @@ cifs_proc_clean(void) remove_proc_entry("DebugData", proc_fs_cifs); remove_proc_entry("open_files", proc_fs_cifs); + remove_proc_entry("open_dirs", proc_fs_cifs); remove_proc_entry("cifsFYI", proc_fs_cifs); remove_proc_entry("traceSMB", proc_fs_cifs); remove_proc_entry("Stats", proc_fs_cifs); @@ -1100,7 +1158,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if ((count < 1) || (count > 11)) return -EINVAL; - memset(flags_string, 0, 12); + memset(flags_string, 0, sizeof(flags_string)); if (copy_from_user(flags_string, buffer, count)) return -EFAULT; diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h index 26327442e383..b51ce64fcccf 100644 --- a/fs/smb/client/cifs_ioctl.h +++ b/fs/smb/client/cifs_ioctl.h @@ -61,7 +61,7 @@ struct smb_query_info { struct smb3_key_debug_info { __u64 Suid; __u16 cipher_type; - __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 auth_key[SMB2_NTLMV2_SESSKEY_SIZE]; __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 35892df7335c..3cc686246908 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -343,7 +343,7 @@ static struct ntlmssp2_name *find_next_av(struct cifs_ses *ses, len = AV_LEN(av); if (AV_TYPE(av) == NTLMSSP_AV_EOL) return NULL; - if (!len || (u8 *)av + sizeof(*av) + len > end) + if ((u8 *)av + sizeof(*av) + len > end) return NULL; return av; } @@ -363,7 +363,7 @@ static int find_av_name(struct cifs_ses *ses, u16 type, char **name, u16 maxlen) av_for_each_entry(ses, av) { len = AV_LEN(av); - if (AV_TYPE(av) != type) + if (AV_TYPE(av) != type || !len) continue; if (!IS_ALIGNED(len, sizeof(__le16))) { cifs_dbg(VFS | ONCE, "%s: bad length(%u) for type %u\n", @@ -532,17 +532,67 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_ return rc; } +/* + * Set up NTLMv2 response blob with SPN (cifs/<hostname>) appended to the + * existing list of AV pairs. + */ +static int set_auth_key_response(struct cifs_ses *ses) +{ + size_t baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + size_t len, spnlen, tilen = 0, num_avs = 2 /* SPN + EOL */; + struct TCP_Server_Info *server = ses->server; + char *spn __free(kfree) = NULL; + struct ntlmssp2_name *av; + char *rsp = NULL; + int rc; + + spnlen = strlen(server->hostname); + len = sizeof("cifs/") + spnlen; + spn = kmalloc(len, GFP_KERNEL); + if (!spn) { + rc = -ENOMEM; + goto out; + } + + spnlen = scnprintf(spn, len, "cifs/%.*s", + (int)spnlen, server->hostname); + + av_for_each_entry(ses, av) + tilen += sizeof(*av) + AV_LEN(av); + + len = baselen + tilen + spnlen * sizeof(__le16) + num_avs * sizeof(*av); + rsp = kmalloc(len, GFP_KERNEL); + if (!rsp) { + rc = -ENOMEM; + goto out; + } + + memcpy(rsp + baselen, ses->auth_key.response, tilen); + av = (void *)(rsp + baselen + tilen); + av->type = cpu_to_le16(NTLMSSP_AV_TARGET_NAME); + av->length = cpu_to_le16(spnlen * sizeof(__le16)); + cifs_strtoUTF16((__le16 *)av->data, spn, spnlen, ses->local_nls); + av = (void *)((__u8 *)av + sizeof(*av) + AV_LEN(av)); + av->type = cpu_to_le16(NTLMSSP_AV_EOL); + av->length = 0; + + rc = 0; + ses->auth_key.len = len; +out: + ses->auth_key.response = rsp; + return rc; +} + int setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) { struct shash_desc *hmacmd5 = NULL; - int rc; - int baselen; - unsigned int tilen; + unsigned char *tiblob = NULL; /* target info blob */ struct ntlmv2_resp *ntlmv2; char ntlmv2_hash[16]; - unsigned char *tiblob = NULL; /* target info blob */ __le64 rsp_timestamp; + __u64 cc; + int rc; if (nls_cp == NULL) { cifs_dbg(VFS, "%s called with nls_cp==NULL\n", __func__); @@ -588,32 +638,25 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) * (as Windows 7 does) */ rsp_timestamp = find_timestamp(ses); + get_random_bytes(&cc, sizeof(cc)); - baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); - tilen = ses->auth_key.len; - tiblob = ses->auth_key.response; + cifs_server_lock(ses->server); - ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); - if (!ses->auth_key.response) { - rc = -ENOMEM; + tiblob = ses->auth_key.response; + rc = set_auth_key_response(ses); + if (rc) { ses->auth_key.len = 0; - goto setup_ntlmv2_rsp_ret; + goto unlock; } - ses->auth_key.len += baselen; ntlmv2 = (struct ntlmv2_resp *) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); ntlmv2->blob_signature = cpu_to_le32(0x00000101); ntlmv2->reserved = 0; ntlmv2->time = rsp_timestamp; - - get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal)); + ntlmv2->client_chal = cc; ntlmv2->reserved2 = 0; - memcpy(ses->auth_key.response + baselen, tiblob, tilen); - - cifs_server_lock(ses->server); - rc = cifs_alloc_hash("hmac(md5)", &hmacmd5); if (rc) { cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index fb04e263611c..0fdadd668a81 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -70,7 +70,6 @@ bool require_gcm_256; /* false by default */ bool enable_negotiate_signing; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ -unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information @@ -261,9 +260,9 @@ cifs_read_super(struct super_block *sb) } if (tcon->nocase) - sb->s_d_op = &cifs_ci_dentry_ops; + set_default_d_op(sb, &cifs_ci_dentry_ops); else - sb->s_d_op = &cifs_dentry_ops; + set_default_d_op(sb, &cifs_dentry_ops); sb->s_root = d_make_root(inode); if (!sb->s_root) { @@ -1526,7 +1525,7 @@ const struct file_operations cifs_file_ops = { .flock = cifs_flock, .fsync = cifs_fsync, .flush = cifs_flush, - .mmap = cifs_file_mmap, + .mmap_prepare = cifs_file_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, @@ -1546,7 +1545,7 @@ const struct file_operations cifs_file_strict_ops = { .flock = cifs_flock, .fsync = cifs_strict_fsync, .flush = cifs_flush, - .mmap = cifs_file_strict_mmap, + .mmap_prepare = cifs_file_strict_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, @@ -1566,7 +1565,7 @@ const struct file_operations cifs_file_direct_ops = { .flock = cifs_flock, .fsync = cifs_fsync, .flush = cifs_flush, - .mmap = cifs_file_mmap, + .mmap_prepare = cifs_file_mmap_prepare, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, @@ -1584,7 +1583,7 @@ const struct file_operations cifs_file_nobrl_ops = { .release = cifs_close, .fsync = cifs_fsync, .flush = cifs_flush, - .mmap = cifs_file_mmap, + .mmap_prepare = cifs_file_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, @@ -1602,7 +1601,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .release = cifs_close, .fsync = cifs_strict_fsync, .flush = cifs_flush, - .mmap = cifs_file_strict_mmap, + .mmap_prepare = cifs_file_strict_mmap_prepare, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, @@ -1620,7 +1619,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .release = cifs_close, .fsync = cifs_fsync, .flush = cifs_flush, - .mmap = cifs_file_mmap, + .mmap_prepare = cifs_file_mmap_prepare, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index ca435a3841b8..487f39cff77e 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -103,8 +103,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); extern int cifs_flush(struct file *, fl_owner_t id); -extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma); -extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); +int cifs_file_mmap_prepare(struct vm_area_desc *desc); +int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc); extern const struct file_operations cifs_dir_ops; extern int cifs_readdir(struct file *file, struct dir_context *ctx); @@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 54 -#define CIFS_VERSION "2.54" +#define SMB3_PRODUCT_BUILD 55 +#define CIFS_VERSION "2.55" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 3b32116b0b49..19dd901fe8ab 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -556,7 +556,7 @@ struct smb_version_operations { void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch, bool *purge_cache); /* create lease context buffer for CREATE request */ - char * (*create_lease_buf)(u8 *lease_key, u8 oplock); + char * (*create_lease_buf)(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 le_flags); /* parse lease context buffer and return oplock/epoch info */ __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, @@ -627,12 +627,14 @@ struct smb_version_operations { bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov, u32 *plen); - int (*create_reparse_symlink)(const unsigned int xid, - struct inode *inode, - struct dentry *dentry, - struct cifs_tcon *tcon, - const char *full_path, - const char *symname); + struct inode * (*create_reparse_inode)(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + bool directory, + struct kvec *reparse_iov, + struct kvec *xattr_iov); }; struct smb_version_values { @@ -709,6 +711,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + struct list_head rlist; /* reconnect list */ spinlock_t srv_lock; /* protect anything here that is not protected */ __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ @@ -773,8 +776,10 @@ struct TCP_Server_Info { char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ __u32 reconnect_instance; /* incremented on each reconnect */ + __le32 session_key_id; /* retrieved from negotiate response and send in session setup request */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ + unsigned long neg_start; /* when negotiate started (jiffies) */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ #define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */ #define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */ @@ -1084,6 +1089,7 @@ struct cifs_chan { }; #define CIFS_SES_FLAG_SCALE_CHANNELS (0x1) +#define CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES (0x2) /* * Session structure. One of these for each uid session with a particular host @@ -1300,6 +1306,7 @@ struct cifs_tcon { bool use_persistent:1; /* use persistent instead of durable handles */ bool no_lease:1; /* Do not request leases on files or directories */ bool use_witness:1; /* use witness protocol */ + bool dummy:1; /* dummy tcon used for reconnecting channels */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1441,6 +1448,7 @@ struct cifs_open_parms { bool reconnect:1; bool replay:1; /* indicates that this open is for a replay */ struct kvec *ea_cctx; + __le32 lease_flags; }; struct cifs_fid { @@ -1448,6 +1456,7 @@ struct cifs_fid { __u64 persistent_fid; /* persist file id for smb2 */ __u64 volatile_fid; /* volatile file id for smb2 */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ + __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; __u8 create_guid[16]; __u32 access; struct cifs_pending_open *pending_open; @@ -1988,8 +1997,7 @@ require use of the stronger protocol */ * TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session * reconnect_mutex * TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session - * cifs_ses->session_mutex cifs_ses sesInfoAlloc - * cifs_tcon + * cifs_ses->session_mutex cifs_ses sesInfoAlloc * cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc * cifs_tcon->pending_opens * cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc @@ -2008,21 +2016,25 @@ require use of the stronger protocol */ * ->oplock_credits * ->reconnect_instance * cifs_ses->ses_lock (anything that is not protected by another lock and can change) + * sesInfoAlloc * cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc * ->iface_count * ->iface_last_update - * cifs_ses->chan_lock cifs_ses->chans + * cifs_ses->chan_lock cifs_ses->chans sesInfoAlloc * ->chans_need_reconnect * ->chans_in_reconnect * cifs_tcon->tc_lock (anything that is not protected by another lock and can change) + * tcon_info_alloc * inode->i_rwsem, taken by fs/netfs/locking.c e.g. should be taken before cifsInodeInfo locks * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs - * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo + * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs + * cached_fid->fid_lock (anything that is not protected by another lock and can change) + * init_cached_dir + * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search * ->oplock_break_cancelled diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 1b79fe07476f..d9cf7db0ac35 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -597,7 +597,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 SecurityBlobLength; __u32 Reserved; __le32 Capabilities; /* see below */ @@ -616,7 +616,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 CaseInsensitivePasswordLength; /* ASCII password len */ __le16 CaseSensitivePasswordLength; /* Unicode password length*/ __u32 Reserved; /* see below */ @@ -654,7 +654,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 PasswordLength; __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index ecf774a8f1ca..40ec0634377f 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -136,6 +136,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); +void smb2_query_server_interfaces(struct work_struct *work); void cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, bool all_channels); @@ -151,8 +152,7 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof, bool from_readdir); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); -void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result, - bool was_async); +void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int); extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, @@ -483,6 +483,14 @@ extern int cifs_query_reparse_point(const unsigned int xid, const char *full_path, u32 *tag, struct kvec *rsp, int *rsp_buftype); +extern struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + bool directory, + struct kvec *reparse_iov, + struct kvec *xattr_iov); extern int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid); extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index f55457b4b82e..6c890db06593 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -498,6 +498,7 @@ CIFSSMBNegotiate(const unsigned int xid, server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->session_key_id = pSMBr->SessionKey; server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; @@ -1333,7 +1334,12 @@ cifs_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted); + goto do_retry; case MID_RETRY_NEEDED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed); +do_retry: + __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); rdata->result = -EAGAIN; if (server->sign && rdata->got_bytes) /* reset bytes number since we can not check a sign */ @@ -1342,8 +1348,14 @@ cifs_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed); + rdata->result = -EIO; + break; default: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown); rdata->result = -EIO; + break; } if (rdata->result == -ENODATA) { @@ -1712,10 +1724,21 @@ cifs_writev_callback(struct mid_q_entry *mid) } break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); + result = -EAGAIN; + break; case MID_RETRY_NEEDED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed); + result = -EIO; + break; default: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown); result = -EIO; break; } @@ -1725,7 +1748,7 @@ cifs_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; - cifs_write_subrequest_terminated(wdata, result, true); + cifs_write_subrequest_terminated(wdata, result); release_mid(mid); trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0, server->credits, server->in_flight, @@ -1813,7 +1836,7 @@ async_writev_out: out: if (rc) { add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, false); + cifs_write_subrequest_terminated(wdata, rc); } } @@ -2753,10 +2776,10 @@ int cifs_query_reparse_point(const unsigned int xid, io_req->TotalParameterCount = 0; io_req->TotalDataCount = 0; - io_req->MaxParameterCount = cpu_to_le32(2); + io_req->MaxParameterCount = cpu_to_le32(0); /* BB find exact data count max from sess structure BB */ io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); - io_req->MaxSetupCount = 4; + io_req->MaxSetupCount = 1; io_req->Reserved = 0; io_req->ParameterOffset = 0; io_req->DataCount = 0; @@ -2783,6 +2806,22 @@ int cifs_query_reparse_point(const unsigned int xid, goto error; } + /* SetupCount must be 1, otherwise offset to ByteCount is incorrect. */ + if (io_rsp->SetupCount != 1) { + rc = -EIO; + goto error; + } + + /* + * ReturnedDataLen is output length of executed IOCTL. + * DataCount is output length transferred over network. + * Check that we have full FSCTL_GET_REPARSE_POINT buffer. + */ + if (data_count != le16_to_cpu(io_rsp->ReturnedDataLen)) { + rc = -EIO; + goto error; + } + end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount; start = (__u8 *)&io_rsp->hdr.Protocol + data_offset; if (start >= end) { @@ -2812,6 +2851,134 @@ error: return rc; } +struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + bool directory, + struct kvec *reparse_iov, + struct kvec *xattr_iov) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_open_parms oparms; + TRANSACT_IOCTL_REQ *io_req; + struct inode *new = NULL; + struct kvec in_iov[2]; + struct kvec out_iov; + struct cifs_fid fid; + int io_req_len; + int oplock = 0; + int buf_type = 0; + int rc; + + cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); + + /* + * If server filesystem does not support reparse points then do not + * attempt to create reparse point. This will prevent creating unusable + * empty object on the server. + */ + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return ERR_PTR(-EOPNOTSUPP); + +#ifndef CONFIG_CIFS_XATTR + if (xattr_iov) + return ERR_PTR(-EOPNOTSUPP); +#endif + + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + FILE_READ_ATTRIBUTES | FILE_WRITE_DATA | FILE_WRITE_EA, + FILE_CREATE, + (directory ? CREATE_NOT_FILE : CREATE_NOT_DIR) | OPEN_REPARSE_POINT, + ACL_NO_MODE); + oparms.fid = &fid; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (rc) + return ERR_PTR(rc); + +#ifdef CONFIG_CIFS_XATTR + if (xattr_iov) { + struct smb2_file_full_ea_info *ea; + + ea = &((struct smb2_create_ea_ctx *)xattr_iov->iov_base)->ea; + while (1) { + rc = CIFSSMBSetEA(xid, + tcon, + full_path, + &ea->ea_data[0], + &ea->ea_data[ea->ea_name_length+1], + le16_to_cpu(ea->ea_value_length), + cifs_sb->local_nls, + cifs_sb); + if (rc) + goto out_close; + if (le32_to_cpu(ea->next_entry_offset) == 0) + break; + ea = (struct smb2_file_full_ea_info *)((u8 *)ea + + le32_to_cpu(ea->next_entry_offset)); + } + } +#endif + + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **)&io_req, NULL); + if (rc) + goto out_close; + + inc_rfc1001_len(io_req, sizeof(io_req->Pad)); + + io_req_len = be32_to_cpu(io_req->hdr.smb_buf_length) + sizeof(io_req->hdr.smb_buf_length); + + /* NT IOCTL response contains one-word long output setup buffer with size of output data. */ + io_req->MaxSetupCount = 1; + /* NT IOCTL response does not contain output parameters. */ + io_req->MaxParameterCount = cpu_to_le32(0); + /* FSCTL_SET_REPARSE_POINT response contains empty output data. */ + io_req->MaxDataCount = cpu_to_le32(0); + + io_req->TotalParameterCount = cpu_to_le32(0); + io_req->TotalDataCount = cpu_to_le32(reparse_iov->iov_len); + io_req->ParameterCount = io_req->TotalParameterCount; + io_req->ParameterOffset = cpu_to_le32(0); + io_req->DataCount = io_req->TotalDataCount; + io_req->DataOffset = cpu_to_le32(offsetof(typeof(*io_req), Data) - + sizeof(io_req->hdr.smb_buf_length)); + io_req->SetupCount = 4; + io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + io_req->FunctionCode = cpu_to_le32(FSCTL_SET_REPARSE_POINT); + io_req->Fid = fid.netfid; + io_req->IsFsctl = 1; + io_req->IsRootFlag = 0; + io_req->ByteCount = cpu_to_le16(le32_to_cpu(io_req->DataCount) + sizeof(io_req->Pad)); + + inc_rfc1001_len(io_req, reparse_iov->iov_len); + + in_iov[0].iov_base = (char *)io_req; + in_iov[0].iov_len = io_req_len; + in_iov[1] = *reparse_iov; + rc = SendReceive2(xid, tcon->ses, in_iov, ARRAY_SIZE(in_iov), &buf_type, + CIFS_NO_RSP_BUF, &out_iov); + + cifs_buf_release(io_req); + + if (!rc) + rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL); + +out_close: + CIFSSMBClose(xid, tcon, fid.netfid); + + /* + * If CREATE was successful but FSCTL_SET_REPARSE_POINT failed then + * remove the intermediate object created by CREATE. Otherwise + * empty object stay on the server when reparse call failed. + */ + if (rc) + CIFSSMBDelFile(xid, tcon, full_path, cifs_sb, NULL); + + return rc ? ERR_PTR(rc) : new; +} + int CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid) @@ -3981,6 +4148,12 @@ findFirstRetry: pSMB->FileName[name_len] = 0; pSMB->FileName[name_len+1] = 0; name_len += 2; + } else if (!searchName[0]) { + pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[1] = 0; + pSMB->FileName[2] = 0; + pSMB->FileName[3] = 0; + name_len = 4; } } else { name_len = copy_path_name(pSMB->FileName, searchName); @@ -3992,6 +4165,10 @@ findFirstRetry: pSMB->FileName[name_len] = '*'; pSMB->FileName[name_len+1] = 0; name_len += 2; + } else if (!searchName[0]) { + pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb); + pSMB->FileName[1] = 0; + name_len = 2; } } @@ -4018,7 +4195,7 @@ findFirstRetry: pSMB->SearchAttributes = cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY); - pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO)); + pSMB->SearchCount = cpu_to_le16(msearch ? CIFSMaxBufSize/sizeof(FILE_UNIX_INFO) : 1); pSMB->SearchFlags = cpu_to_le16(search_flags); pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 6bf04d9a5491..5eec8957f2a9 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -97,7 +97,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) return rc; } -static void smb2_query_server_interfaces(struct work_struct *work) +void smb2_query_server_interfaces(struct work_struct *work) { int rc; int xid; @@ -116,18 +116,22 @@ static void smb2_query_server_interfaces(struct work_struct *work) rc = server->ops->query_server_interfaces(xid, tcon, false); free_xid(xid); - if (rc) { - if (rc == -EOPNOTSUPP) - return; - + if (rc) cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); - } queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, (SMB_INTERFACE_POLL_INTERVAL * HZ)); } +#define set_need_reco(server) \ +do { \ + spin_lock(&server->srv_lock); \ + if (server->tcpStatus != CifsExiting) \ + server->tcpStatus = CifsNeedReconnect; \ + spin_unlock(&server->srv_lock); \ +} while (0) + /* * Update the tcpStatus for the server. * This is used to signal the cifsd thread to call cifs_reconnect @@ -141,39 +145,45 @@ void cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, bool all_channels) { - struct TCP_Server_Info *pserver; + struct TCP_Server_Info *nserver; struct cifs_ses *ses; + LIST_HEAD(reco); int i; - /* If server is a channel, select the primary channel */ - pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; - /* if we need to signal just this channel */ if (!all_channels) { - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsExiting) - server->tcpStatus = CifsNeedReconnect; - spin_unlock(&server->srv_lock); + set_need_reco(server); return; } - spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - if (cifs_ses_exiting(ses)) - continue; - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (!ses->chans[i].server) + if (SERVER_IS_CHAN(server)) + server = server->primary_server; + scoped_guard(spinlock, &cifs_tcp_ses_lock) { + set_need_reco(server); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); continue; - - spin_lock(&ses->chans[i].server->srv_lock); - if (ses->chans[i].server->tcpStatus != CifsExiting) - ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&ses->chans[i].server->srv_lock); + } + spin_lock(&ses->chan_lock); + for (i = 1; i < ses->chan_count; i++) { + nserver = ses->chans[i].server; + if (!nserver) + continue; + nserver->srv_count++; + list_add(&nserver->rlist, &reco); + } + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); } - spin_unlock(&ses->chan_lock); } - spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(server, nserver, &reco, rlist) { + list_del_init(&server->rlist); + set_need_reco(server); + cifs_put_tcp_session(server, 0); + } } /* @@ -377,6 +387,13 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, if (!cifs_tcp_ses_needs_reconnect(server, 1)) return 0; + /* + * if smb session has been marked for reconnect, also reconnect all + * connections. This way, the other connections do not end up bad. + */ + if (mark_smb_session) + cifs_signal_cifsd_for_reconnect(server, mark_smb_session); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); cifs_abort_connection(server); @@ -385,7 +402,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, try_to_freeze(); cifs_server_lock(server); - if (!cifs_swn_set_server_dstaddr(server)) { + if (!cifs_swn_set_server_dstaddr(server) && + !SERVER_IS_CHAN(server)) { /* resolve the hostname again to make sure that IP address is up-to-date */ rc = reconn_set_ipaddr_from_hostname(server); cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc); @@ -661,12 +679,12 @@ server_unresponsive(struct TCP_Server_Info *server) /* * If we're in the process of mounting a share or reconnecting a session * and the server abruptly shut down (e.g. socket wasn't closed, packet - * had been ACK'ed but no SMB response), don't wait longer than 20s to - * negotiate protocol. + * had been ACK'ed but no SMB response), don't wait longer than 20s from + * when negotiate actually started. */ spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate && - time_after(jiffies, server->lstrp + 20 * HZ)) { + time_after(jiffies, server->neg_start + 20 * HZ)) { spin_unlock(&server->srv_lock); cifs_reconnect(server, false); return true; @@ -2862,20 +2880,14 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->max_cached_dirs = ctx->max_cached_dirs; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; - INIT_LIST_HEAD(&tcon->pending_opens); tcon->status = TID_GOOD; - INIT_DELAYED_WORK(&tcon->query_interfaces, - smb2_query_server_interfaces); if (ses->server->dialect >= SMB30_PROT_ID && (ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { /* schedule query interfaces poll */ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, (SMB_INTERFACE_POLL_INTERVAL * HZ)); } -#ifdef CONFIG_CIFS_DFS_UPCALL - INIT_DELAYED_WORK(&tcon->dfs_cache_work, dfs_cache_refresh); -#endif spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3350,18 +3362,15 @@ generic_ip_connect(struct TCP_Server_Info *server) struct net *net = cifs_net_ns(server); struct sock *sk; - rc = __sock_create(net, sfamily, SOCK_STREAM, - IPPROTO_TCP, &server->ssocket, 1); + rc = sock_create_kern(net, sfamily, SOCK_STREAM, + IPPROTO_TCP, &server->ssocket); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } sk = server->ssocket->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); @@ -3714,9 +3723,15 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) goto out; } - /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ - if (tcon->posix_extensions) + /* + * if new SMB3.11 POSIX extensions are supported, do not change anything in the + * path (i.e., do not remap / and \ and do not map any special characters) + */ + if (tcon->posix_extensions) { cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; + cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MAP_SFM_CHR | + CIFS_MOUNT_MAP_SPECIAL_CHR); + } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* tell server which Unix caps we support */ @@ -4189,7 +4204,9 @@ retry: return 0; } + server->lstrp = jiffies; server->tcpStatus = CifsInNegotiate; + server->neg_start = jiffies; spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index d1e95632ac54..5223edf6d11a 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -23,6 +23,7 @@ #include "fs_context.h" #include "cifs_ioctl.h" #include "fscache.h" +#include "cached_dir.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -189,7 +190,9 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int disposition; struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + struct cached_fid *parent_cfid = NULL; int rdwr_for_fscache = 0; + __le32 lease_flags = 0; *oplock = 0; if (tcon->ses->server->oplocks) @@ -311,7 +314,28 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; + retry_open: + if (tcon->cfids && direntry->d_parent && server->dialect >= SMB30_PROT_ID) { + parent_cfid = NULL; + spin_lock(&tcon->cfids->cfid_list_lock); + list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) { + if (parent_cfid->dentry == direntry->d_parent) { + cifs_dbg(FYI, "found a parent cached file handle\n"); + if (parent_cfid->has_lease && parent_cfid->time) { + lease_flags + |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; + memcpy(fid->parent_lease_key, + parent_cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE); + parent_cfid->dirents.is_valid = false; + } + break; + } + } + spin_unlock(&tcon->cfids->cfid_list_lock); + } + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -320,6 +344,7 @@ retry_open: .disposition = disposition, .path = full_path, .fid = fid, + .lease_flags = lease_flags, .mode = mode, }; rc = server->ops->open(xid, &oparms, oplock, buf); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 950aa4f912f5..186e061068be 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -52,6 +52,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr]; struct TCP_Server_Info *server; struct cifsFileInfo *open_file = req->cfile; + struct cifs_sb_info *cifs_sb = CIFS_SB(wdata->rreq->inode->i_sb); size_t wsize = req->rreq.wsize; int rc; @@ -63,6 +64,10 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); wdata->server = server; + if (cifs_sb->ctx->wsize == 0) + cifs_negotiate_wsize(server, cifs_sb->ctx, + tlink_tcon(req->cfile->tlink)); + retry: if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, false); @@ -130,7 +135,7 @@ fail: else trace_netfs_sreq(subreq, netfs_sreq_trace_fail); add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, false); + cifs_write_subrequest_terminated(wdata, rc); goto out; } @@ -160,10 +165,9 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; - if (cifs_sb->ctx->rsize == 0) { + if (cifs_sb->ctx->rsize == 0) cifs_negotiate_rsize(server, cifs_sb->ctx, tlink_tcon(req->cfile->tlink)); - } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &size, &rdata->credits); @@ -219,7 +223,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) goto failed; } - if (subreq->rreq->origin != NETFS_DIO_READ) + if (subreq->rreq->origin != NETFS_UNBUFFERED_READ && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); @@ -998,15 +1003,18 @@ int cifs_open(struct inode *inode, struct file *file) rc = cifs_get_readable_path(tcon, full_path, &cfile); } if (rc == 0) { - if (file->f_flags == cfile->f_flags) { + unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC); + + if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) && + (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) { file->private_data = cfile; spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); goto use_cache; - } else { - _cifsFileInfo_put(cfile, true, false); } + _cifsFileInfo_put(cfile, true, false); } else { /* hard link on the defeered close file */ rc = cifs_get_hardlink_path(tcon, inode, file); @@ -2423,8 +2431,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) return rc; } -void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result, - bool was_async) +void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result) { struct netfs_io_request *wreq = wdata->rreq; struct netfs_inode *ictx = netfs_inode(wreq->inode); @@ -2441,7 +2448,7 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t netfs_resize_file(ictx, wrend, true); } - netfs_write_subrequest_terminated(&wdata->subreq, result, was_async); + netfs_write_subrequest_terminated(&wdata->subreq, result); } struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, @@ -2992,38 +2999,38 @@ static const struct vm_operations_struct cifs_file_vm_ops = { .page_mkwrite = cifs_page_mkwrite, }; -int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) +int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc) { int xid, rc = 0; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(desc->file); xid = get_xid(); if (!CIFS_CACHE_READ(CIFS_I(inode))) rc = cifs_zap_mapping(inode); if (!rc) - rc = generic_file_mmap(file, vma); + rc = generic_file_mmap_prepare(desc); if (!rc) - vma->vm_ops = &cifs_file_vm_ops; + desc->vm_ops = &cifs_file_vm_ops; free_xid(xid); return rc; } -int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) +int cifs_file_mmap_prepare(struct vm_area_desc *desc) { int rc, xid; xid = get_xid(); - rc = cifs_revalidate_file(file); + rc = cifs_revalidate_file(desc->file); if (rc) cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n", rc); if (!rc) - rc = generic_file_mmap(file, vma); + rc = generic_file_mmap_prepare(desc); if (!rc) - vma->vm_ops = &cifs_file_vm_ops; + desc->vm_ops = &cifs_file_vm_ops; free_xid(xid); return rc; @@ -3081,7 +3088,8 @@ void cifs_oplock_break(struct work_struct *work) struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); struct inode *inode = d_inode(cfile->dentry); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon; struct TCP_Server_Info *server; @@ -3091,6 +3099,12 @@ void cifs_oplock_break(struct work_struct *work) __u64 persistent_fid, volatile_fid; __u16 net_fid; + /* + * Hold a reference to the superblock to prevent it and its inodes from + * being freed while we are accessing cinode. Otherwise, _cifsFileInfo_put() + * may release the last reference to the sb and trigger inode eviction. + */ + cifs_sb_active(sb); wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); @@ -3163,6 +3177,7 @@ oplock_break_ack: cifs_put_tlink(tlink); out: cifs_done_oplock_break(cinode); + cifs_sb_deactive(sb); } static int cifs_swap_activate(struct swap_info_struct *sis, diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index a634a34d4086..3f34bb07997b 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1475,35 +1475,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, pr_warn("username too long\n"); goto cifs_parse_mount_err; } - ctx->username = kstrdup(param->string, GFP_KERNEL); - if (ctx->username == NULL) { - cifs_errorf(fc, "OOM when copying username string\n"); - goto cifs_parse_mount_err; - } + ctx->username = no_free_ptr(param->string); break; case Opt_pass: kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; - - ctx->password = kstrdup(param->string, GFP_KERNEL); - if (ctx->password == NULL) { - cifs_errorf(fc, "OOM when copying password string\n"); - goto cifs_parse_mount_err; - } + ctx->password = no_free_ptr(param->string); break; case Opt_pass2: kfree_sensitive(ctx->password2); ctx->password2 = NULL; if (strlen(param->string) == 0) break; - - ctx->password2 = kstrdup(param->string, GFP_KERNEL); - if (ctx->password2 == NULL) { - cifs_errorf(fc, "OOM when copying password2 string\n"); - goto cifs_parse_mount_err; - } + ctx->password2 = no_free_ptr(param->string); break; case Opt_ip: if (strlen(param->string) == 0) { @@ -1526,11 +1512,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } kfree(ctx->domainname); - ctx->domainname = kstrdup(param->string, GFP_KERNEL); - if (ctx->domainname == NULL) { - cifs_errorf(fc, "OOM when copying domainname string\n"); - goto cifs_parse_mount_err; - } + ctx->domainname = no_free_ptr(param->string); cifs_dbg(FYI, "Domain name set\n"); break; case Opt_srcaddr: @@ -1550,11 +1532,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (strncasecmp(param->string, "default", 7) != 0) { kfree(ctx->iocharset); - ctx->iocharset = kstrdup(param->string, GFP_KERNEL); - if (ctx->iocharset == NULL) { - cifs_errorf(fc, "OOM when copying iocharset string\n"); - goto cifs_parse_mount_err; - } + ctx->iocharset = no_free_ptr(param->string); } /* if iocharset not set then load_nls_default * is used by caller @@ -1824,10 +1802,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "symlinkroot mount options must be absolute path\n"); goto cifs_parse_mount_err; } - kfree(ctx->symlinkroot); - ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL); - if (!ctx->symlinkroot) + if (strnlen(param->string, PATH_MAX) == PATH_MAX) { + cifs_errorf(fc, "symlinkroot path too long (max path length: %u)\n", + PATH_MAX - 1); goto cifs_parse_mount_err; + } + kfree(ctx->symlinkroot); + ctx->symlinkroot = param->string; + param->string = NULL; break; } /* case Opt_ignore: - is ignored as expected ... */ @@ -1837,13 +1819,6 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } - /* - * By default resolve all native absolute symlinks relative to "/mnt/". - * Same default has drvfs driver running in WSL for resolving SMB shares. - */ - if (!ctx->symlinkroot) - ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL); - return 0; cifs_parse_mount_err: diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 56439da4f119..0a9935ce05a5 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -506,7 +506,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) le16_to_cpu(tcon->ses->server->cipher_type); pkey_inf.Suid = tcon->ses->Suid; memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, - 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + SMB2_NTLMV2_SESSKEY_SIZE); memcpy(pkey_inf.smb3decryptionkey, tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); memcpy(pkey_inf.smb3encryptionkey, diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 769752ad2c5c..2ecd705e9e8c 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -19,6 +19,7 @@ #include "smb2proto.h" #include "cifs_ioctl.h" #include "fs_context.h" +#include "reparse.h" /* * M-F Symlink Functions - Begin @@ -570,7 +571,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, int rc = -EOPNOTSUPP; unsigned int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; @@ -593,7 +593,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, goto symlink_exit; } pTcon = tlink_tcon(tlink); - server = cifs_pick_channel(pTcon->ses); full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { @@ -643,13 +642,9 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, case CIFS_SYMLINK_TYPE_NATIVE: case CIFS_SYMLINK_TYPE_NFS: case CIFS_SYMLINK_TYPE_WSL: - if (server->ops->create_reparse_symlink && - (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) { - rc = server->ops->create_reparse_symlink(xid, inode, - direntry, - pTcon, - full_path, - symname); + if (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { + rc = create_reparse_symlink(xid, inode, direntry, pTcon, + full_path, symname); goto symlink_exit; } break; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 7b6ed9b23e71..da23cc12a52c 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -151,6 +151,12 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace) #ifdef CONFIG_CIFS_DFS_UPCALL INIT_LIST_HEAD(&ret_buf->dfs_ses_list); #endif + INIT_LIST_HEAD(&ret_buf->pending_opens); + INIT_DELAYED_WORK(&ret_buf->query_interfaces, + smb2_query_server_interfaces); +#ifdef CONFIG_CIFS_DFS_UPCALL + INIT_DELAYED_WORK(&ret_buf->dfs_cache_work, dfs_cache_refresh); +#endif return ret_buf; } @@ -326,6 +332,14 @@ check_smb_hdr(struct smb_hdr *smb) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; + /* + * Windows NT server returns error resposne (e.g. STATUS_DELETE_PENDING + * or STATUS_OBJECT_NAME_NOT_FOUND or ERRDOS/ERRbadfile or any other) + * for some TRANS2 requests without the RESPONSE flag set in header. + */ + if (smb->Command == SMB_COM_TRANSACTION2 && smb->Status.CifsError != 0) + return 0; + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", get_mid(smb)); return 1; diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index 778daf11f1db..52a520349cb7 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -146,6 +146,9 @@ static char *automount_fullpath(struct dentry *dentry, void *page) } spin_unlock(&tcon->tc_lock); + if (unlikely(!page)) + return ERR_PTR(-ENOMEM); + s = dentry_path_raw(dentry, page, PATH_MAX); if (IS_ERR(s)) return s; diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index f9f11cbf89be..4e5460206397 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -264,7 +264,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, /* The Mode field in the response can now include the file type as well */ fattr->cf_mode = wire_mode_to_posix(le32_to_cpu(info->Mode), fattr->cf_cifsattrs & ATTR_DIRECTORY); - fattr->cf_dtype = S_DT(le32_to_cpu(info->Mode)); + fattr->cf_dtype = S_DT(fattr->cf_mode); switch (fattr->cf_mode & S_IFMT) { case S_IFLNK: @@ -851,9 +851,9 @@ static bool emit_cached_dirents(struct cached_dirents *cde, } static void update_cached_dirents_count(struct cached_dirents *cde, - struct dir_context *ctx) + struct file *file) { - if (cde->ctx != ctx) + if (cde->file != file) return; if (cde->is_valid || cde->is_failed) return; @@ -862,9 +862,9 @@ static void update_cached_dirents_count(struct cached_dirents *cde, } static void finished_cached_dirents_count(struct cached_dirents *cde, - struct dir_context *ctx) + struct dir_context *ctx, struct file *file) { - if (cde->ctx != ctx) + if (cde->file != file) return; if (cde->is_valid || cde->is_failed) return; @@ -877,11 +877,12 @@ static void finished_cached_dirents_count(struct cached_dirents *cde, static void add_cached_dirent(struct cached_dirents *cde, struct dir_context *ctx, const char *name, int namelen, - struct cifs_fattr *fattr) + struct cifs_fattr *fattr, + struct file *file) { struct cached_dirent *de; - if (cde->ctx != ctx) + if (cde->file != file) return; if (cde->is_valid || cde->is_failed) return; @@ -911,7 +912,8 @@ static void add_cached_dirent(struct cached_dirents *cde, static bool cifs_dir_emit(struct dir_context *ctx, const char *name, int namelen, struct cifs_fattr *fattr, - struct cached_fid *cfid) + struct cached_fid *cfid, + struct file *file) { bool rc; ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); @@ -923,7 +925,7 @@ static bool cifs_dir_emit(struct dir_context *ctx, if (cfid) { mutex_lock(&cfid->dirents.de_mutex); add_cached_dirent(&cfid->dirents, ctx, name, namelen, - fattr); + fattr, file); mutex_unlock(&cfid->dirents.de_mutex); } @@ -1023,7 +1025,7 @@ static int cifs_filldir(char *find_entry, struct file *file, cifs_prime_dcache(file_dentry(file), &name, &fattr); return !cifs_dir_emit(ctx, name.name, name.len, - &fattr, cfid); + &fattr, cfid, file); } @@ -1074,8 +1076,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) * we need to initialize scanning and storing the * directory content. */ - if (ctx->pos == 0 && cfid->dirents.ctx == NULL) { - cfid->dirents.ctx = ctx; + if (ctx->pos == 0 && cfid->dirents.file == NULL) { + cfid->dirents.file = file; cfid->dirents.pos = 2; } /* @@ -1143,7 +1145,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } else { if (cfid) { mutex_lock(&cfid->dirents.de_mutex); - finished_cached_dirents_count(&cfid->dirents, ctx); + finished_cached_dirents_count(&cfid->dirents, ctx, file); mutex_unlock(&cfid->dirents.de_mutex); } cifs_dbg(FYI, "Could not find entry\n"); @@ -1184,7 +1186,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; if (cfid) { mutex_lock(&cfid->dirents.de_mutex); - update_cached_dirents_count(&cfid->dirents, ctx); + update_cached_dirents_count(&cfid->dirents, file); mutex_unlock(&cfid->dirents.de_mutex); } diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index bb25e77c5540..33c1d970747c 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -34,7 +34,7 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb, const char *symname, bool *directory); -int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, +int create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname) { @@ -57,6 +57,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, struct reparse_symlink_data_buffer *buf = NULL; struct cifs_open_info_data data = {}; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + const char *symroot = cifs_sb->ctx->symlinkroot; struct inode *new; struct kvec iov; __le16 *path = NULL; @@ -82,7 +83,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, .symlink_target = symlink_target, }; - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && + symroot && symname[0] == '/') { /* * This is a request to create an absolute symlink on the server * which does not support POSIX paths, and expects symlink in @@ -92,7 +94,7 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, * ensure compatibility of this symlink stored in absolute form * on the SMB server. */ - if (!strstarts(symname, cifs_sb->ctx->symlinkroot)) { + if (!strstarts(symname, symroot)) { /* * If the absolute Linux symlink target path is not * inside "symlinkroot" location then there is no way @@ -101,12 +103,12 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, cifs_dbg(VFS, "absolute symlink '%s' cannot be converted to NT format " "because it is outside of symlinkroot='%s'\n", - symname, cifs_sb->ctx->symlinkroot); + symname, symroot); rc = -EINVAL; goto out; } - len = strlen(cifs_sb->ctx->symlinkroot); - if (cifs_sb->ctx->symlinkroot[len-1] != '/') + len = strlen(symroot); + if (symroot[len - 1] != '/') len++; if (symname[len] >= 'a' && symname[len] <= 'z' && (symname[len+1] == '/' || symname[len+1] == '\0')) { @@ -225,7 +227,8 @@ static int create_native_symlink(const unsigned int xid, struct inode *inode, iov.iov_base = buf; iov.iov_len = len; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, directory, &iov, NULL); if (!IS_ERR(new)) @@ -397,7 +400,8 @@ static int create_native_socket(const unsigned int xid, struct inode *inode, struct inode *new; int rc = 0; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, false, &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); @@ -490,7 +494,8 @@ static int mknod_nfs(unsigned int xid, struct inode *inode, .symlink_target = kstrdup(symname, GFP_KERNEL), }; - new = smb2_get_reparse_inode(&data, inode->i_sb, xid, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, false, &iov, NULL); if (!IS_ERR(new)) d_instantiate(dentry, new); @@ -683,7 +688,8 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, memcpy(data.wsl.eas, &cc->ea, len); data.wsl.eas_len = len; - new = smb2_get_reparse_inode(&data, inode->i_sb, + new = tcon->ses->server->ops->create_reparse_inode( + &data, inode->i_sb, xid, tcon, full_path, false, &reparse_iov, &xattr_iov); if (!IS_ERR(new)) @@ -696,7 +702,7 @@ static int mknod_wsl(unsigned int xid, struct inode *inode, return rc; } -int smb2_mknod_reparse(unsigned int xid, struct inode *inode, +int mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev) { @@ -782,6 +788,7 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, const char *full_path, struct cifs_sb_info *cifs_sb) { + const char *symroot = cifs_sb->ctx->symlinkroot; char sep = CIFS_DIR_SEP(cifs_sb); char *linux_target = NULL; char *smb_target = NULL; @@ -815,7 +822,8 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len, goto out; } - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && !relative) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && + symroot && !relative) { /* * This is an absolute symlink from the server which does not * support POSIX paths, so the symlink is in NT-style path. @@ -875,15 +883,8 @@ globalroot: abs_path += sizeof("\\DosDevices\\")-1; else if (strstarts(abs_path, "\\GLOBAL??\\")) abs_path += sizeof("\\GLOBAL??\\")-1; - else { - /* Unhandled absolute symlink, points outside of DOS/Win32 */ - cifs_dbg(VFS, - "absolute symlink '%s' cannot be converted from NT format " - "because points to unknown target\n", - smb_target); - rc = -EIO; - goto out; - } + else + goto out_unhandled_target; /* Sometimes path separator after \?? is double backslash */ if (abs_path[0] == '\\') @@ -910,25 +911,19 @@ globalroot: abs_path++; abs_path[0] = drive_letter; } else { - /* Unhandled absolute symlink. Report an error. */ - cifs_dbg(VFS, - "absolute symlink '%s' cannot be converted from NT format " - "because points to unknown target\n", - smb_target); - rc = -EIO; - goto out; + goto out_unhandled_target; } abs_path_len = strlen(abs_path)+1; - symlinkroot_len = strlen(cifs_sb->ctx->symlinkroot); - if (cifs_sb->ctx->symlinkroot[symlinkroot_len-1] == '/') + symlinkroot_len = strlen(symroot); + if (symroot[symlinkroot_len - 1] == '/') symlinkroot_len--; linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL); if (!linux_target) { rc = -ENOMEM; goto out; } - memcpy(linux_target, cifs_sb->ctx->symlinkroot, symlinkroot_len); + memcpy(linux_target, symroot, symlinkroot_len); linux_target[symlinkroot_len] = '/'; memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len); } else if (smb_target[0] == sep && relative) { @@ -966,6 +961,7 @@ globalroot: * These paths have same format as Linux symlinks, so no * conversion is needed. */ +out_unhandled_target: linux_target = smb_target; smb_target = NULL; } @@ -1172,7 +1168,6 @@ out: if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK)) return false; - fattr->cf_dtype = S_DT(fattr->cf_mode); return true; } diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 08de853b36a8..66269c10beba 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -129,10 +129,10 @@ static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct cifs_open_info_data *data); -int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, +int create_reparse_symlink(const unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, const char *symname); -int smb2_mknod_reparse(unsigned int xid, struct inode *inode, +int mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index b3fa9ee26912..0a8c2fcc9ded 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -332,6 +332,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) struct cifs_server_iface *old_iface = NULL; struct cifs_server_iface *last_iface = NULL; struct sockaddr_storage ss; + int retry = 0; spin_lock(&ses->chan_lock); chan_index = cifs_ses_get_chan_index(ses, server); @@ -360,6 +361,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) return; } +try_again: last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, iface_head); iface_min_speed = last_iface->speed; @@ -397,6 +399,13 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) } if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_for_each_entry(iface, &ses->iface_list, iface_head) + iface->weight_fulfilled = 0; + + /* see if it can be satisfied in second attempt */ + if (!retry++) + goto try_again; + iface = NULL; cifs_dbg(FYI, "unable to find a suitable iface\n"); } @@ -445,6 +454,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) ses->chans[chan_index].iface = iface; spin_unlock(&ses->chan_lock); + + spin_lock(&server->srv_lock); + memcpy(&server->dstaddr, &iface->sockaddr, sizeof(server->dstaddr)); + spin_unlock(&server->srv_lock); } static int @@ -494,8 +507,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->domainauto = ses->domainAuto; ctx->domainname = ses->domainName; - /* no hostname for extra channels */ - ctx->server_hostname = ""; + ctx->server_hostname = ses->server->hostname; ctx->username = ses->user_name; ctx->password = ses->password; @@ -628,6 +640,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq); pSMB->req.VcNumber = cpu_to_le16(1); + pSMB->req.SessionKey = server->session_key_id; /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -1684,22 +1697,22 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; capabilities = cifs_ssetup_hdr(ses, server, pSMB); - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - return -ENOSYS; - } - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities |= cpu_to_le32(capabilities); bcc_ptr = sess_data->iov[2].iov_base; - /* unicode strings must be word aligned */ - if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { - *bcc_ptr = 0; - bcc_ptr++; + + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { + /* unicode strings must be word aligned */ + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + } else { + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); } - unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); sess_data->iov[2].iov_len = (long) bcc_ptr - (long) sess_data->iov[2].iov_base; diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index b27a182629ec..e364b6515af3 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -16,6 +16,7 @@ #include "fs_context.h" #include "nterr.h" #include "smberr.h" +#include "reparse.h" /* * An NT cancel request header looks just like the original request except: @@ -1263,17 +1264,26 @@ cifs_make_node(unsigned int xid, struct inode *inode, if (rc == 0) d_instantiate(dentry, newinode); return rc; + } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers + * and was used by default in earlier versions of Windows. + */ + return cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); + } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { + /* + * mknod via reparse points requires server support for + * storing reparse points, which is available since + * Windows 2000, but was not widely used until release + * of Windows Server 2012 by the Windows NFS server. + */ + return mknod_reparse(xid, inode, dentry, tcon, + full_path, mode, dev); + } else { + return -EOPNOTSUPP; } - /* - * Check if mounted with mount parm 'sfu' mount parm. - * SFU emulation should work with all servers, but only - * supports block and char device, socket & fifo, - * and was used by default in earlier versions of Windows - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return -EPERM; - return cifs_sfu_make_node(xid, inode, dentry, tcon, - full_path, mode, dev); } static bool @@ -1370,6 +1380,7 @@ struct smb_version_operations smb1_operations = { .create_hardlink = CIFSCreateHardLink, .query_symlink = cifs_query_symlink, .get_reparse_point_buffer = cifs_get_reparse_point_buffer, + .create_reparse_inode = cifs_create_reparse_inode, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 2a3e46b8e15a..69d251726c02 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1058,10 +1058,11 @@ int smb2_query_path_info(const unsigned int xid, * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create * response. */ - if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) + if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) { cmds[num_cmds++] = SMB2_OP_GET_REPARSE; - if (!tcon->posix_extensions) - cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; + if (!tcon->posix_extensions) + cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA; + } oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES | @@ -1320,7 +1321,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return rc; } -struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, +struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data, struct super_block *sb, const unsigned int xid, struct cifs_tcon *tcon, @@ -1346,7 +1347,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, * empty object on the server. */ if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) - return ERR_PTR(-EOPNOTSUPP); + if (!tcon->posix_extensions) + return ERR_PTR(-EOPNOTSUPP); oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, SYNCHRONIZE | DELETE | diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 2fe8eeb98535..1b4a31894f43 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -504,6 +504,9 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { + struct smbdirect_socket_parameters *sp = + &server->smbd_conn->socket.parameters; + if (server->sign) /* * Account for SMB2 data transfer packet header and @@ -511,12 +514,12 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) */ wsize = min_t(unsigned int, wsize, - server->smbd_conn->max_fragmented_send_size - + sp->max_fragmented_send_size - SMB2_READWRITE_PDU_HEADER_SIZE - sizeof(struct smb2_transform_hdr)); else wsize = min_t(unsigned int, - wsize, server->smbd_conn->max_readwrite_size); + wsize, sp->max_read_write_size); } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) @@ -552,6 +555,9 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { + struct smbdirect_socket_parameters *sp = + &server->smbd_conn->socket.parameters; + if (server->sign) /* * Account for SMB2 data transfer packet header and @@ -559,12 +565,12 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) */ rsize = min_t(unsigned int, rsize, - server->smbd_conn->max_fragmented_recv_size - + sp->max_fragmented_recv_size - SMB2_READWRITE_PDU_HEADER_SIZE - sizeof(struct smb2_transform_hdr)); else rsize = min_t(unsigned int, - rsize, server->smbd_conn->max_readwrite_size); + rsize, sp->max_read_write_size); } #endif @@ -4069,7 +4075,7 @@ map_oplock_to_lease(u8 oplock) } static char * -smb2_create_lease_buf(u8 *lease_key, u8 oplock) +smb2_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags) { struct create_lease *buf; @@ -4095,7 +4101,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) } static char * -smb3_create_lease_buf(u8 *lease_key, u8 oplock) +smb3_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags) { struct create_lease_v2 *buf; @@ -4105,6 +4111,9 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + buf->lcontext.LeaseFlags = flags; + if (flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) + memcpy(&buf->lcontext.ParentLeaseKey, parent_lease_key, SMB2_LEASE_KEY_SIZE); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_lease_v2, lcontext)); @@ -4307,6 +4316,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; u8 *iv; + DECLARE_CRYPTO_WAIT(wait); unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; size_t sensitive_size; @@ -4357,7 +4367,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); - rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req); + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + + rc = crypto_wait_req(enc ? crypto_aead_encrypt(req) + : crypto_aead_decrypt(req), &wait); if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); @@ -5246,8 +5260,9 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); - } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { - rc = smb2_mknod_reparse(xid, inode, dentry, tcon, + } else if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) + || (tcon->posix_extensions)) { + rc = mknod_reparse(xid, inode, dentry, tcon, full_path, mode, dev); } return rc; @@ -5306,7 +5321,7 @@ struct smb_version_operations smb20_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5409,7 +5424,7 @@ struct smb_version_operations smb21_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5516,7 +5531,7 @@ struct smb_version_operations smb30_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -5632,7 +5647,7 @@ struct smb_version_operations smb311_operations = { .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, - .create_reparse_symlink = smb2_create_reparse_symlink, + .create_reparse_inode = smb2_create_reparse_inode, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4e28632b5fd6..2df93a75e3b8 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -36,6 +36,7 @@ #include "smb2glob.h" #include "cifspdu.h" #include "cifs_spnego.h" +#include "../common/smbdirect/smbdirect.h" #include "smbdirect.h" #include "trace.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -411,14 +412,23 @@ skip_sess_setup: if (!rc && (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) && server->ops->query_server_interfaces) { - mutex_unlock(&ses->session_mutex); - /* - * query server network interfaces, in case they change + * query server network interfaces, in case they change. + * Also mark the session as pending this update while the query + * is in progress. This will be used to avoid calling + * smb2_reconnect recursively. */ + ses->flags |= CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES; xid = get_xid(); rc = server->ops->query_server_interfaces(xid, tcon, false); free_xid(xid); + ses->flags &= ~CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES; + + if (!tcon->ipc && !tcon->dummy) + queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, + (SMB_INTERFACE_POLL_INTERVAL * HZ)); + + mutex_unlock(&ses->session_mutex); if (rc == -EOPNOTSUPP && ses->chan_count > 1) { /* @@ -438,11 +448,8 @@ skip_sess_setup: if (ses->chan_max > ses->chan_count && ses->iface_count && !SERVER_IS_CHAN(server)) { - if (ses->chan_count == 1) { + if (ses->chan_count == 1) cifs_server_dbg(VFS, "supports multichannel now\n"); - queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, - (SMB_INTERFACE_POLL_INTERVAL * HZ)); - } cifs_try_adding_channels(ses); } @@ -560,11 +567,18 @@ static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon, struct TCP_Server_Info *server, void **request_buf, unsigned int *total_len) { - /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */ - if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) { + /* + * Skip reconnect in one of the following cases: + * 1. For FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs + * 2. For FSCTL_QUERY_NETWORK_INTERFACE_INFO IOCTL when called from + * smb2_reconnect (indicated by CIFS_SES_FLAG_SCALE_CHANNELS ses flag) + */ + if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO || + (opcode == FSCTL_QUERY_NETWORK_INTERFACE_INFO && + (tcon->ses->flags & CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES))) return __smb2_plain_req_init(SMB2_IOCTL, tcon, server, request_buf, total_len); - } + return smb2_plain_req_init(SMB2_IOCTL, tcon, server, request_buf, total_len); } @@ -2392,11 +2406,16 @@ static int add_lease_context(struct TCP_Server_Info *server, struct smb2_create_req *req, struct kvec *iov, - unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) + unsigned int *num_iovec, + u8 *lease_key, + __u8 *oplock, + u8 *parent_lease_key, + __le32 flags) { unsigned int num = *num_iovec; - iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock, + parent_lease_key, flags); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; @@ -3069,7 +3088,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->RequestedOplockLevel = *oplock; /* no srv lease support */ else { rc = add_lease_context(server, req, iov, &n_iov, - oparms->fid->lease_key, oplock); + oparms->fid->lease_key, oplock, + oparms->fid->parent_lease_key, + oparms->lease_flags); if (rc) return rc; } @@ -4208,10 +4229,8 @@ void smb2_reconnect_server(struct work_struct *work) } goto done; } - tcon->status = TID_GOOD; - tcon->retry = false; - tcon->need_reconnect = false; + tcon->dummy = true; /* now reconnect sessions for necessary channels */ list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { @@ -4442,10 +4461,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a RDMA write, fill in and append - * smbd_buffer_descriptor_v1 to the end of read request + * smbdirect_buffer_descriptor_v1 to the end of read request */ if (rdata && smb3_use_rdma_offload(io_parms)) { - struct smbd_buffer_descriptor_v1 *v1; + struct smbdirect_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->subreq.io_iter, @@ -4459,8 +4478,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, req->ReadChannelInfoOffset = cpu_to_le16(offsetof(struct smb2_read_req, Buffer)); req->ReadChannelInfoLength = - cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); - v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; + cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1)); + v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0]; v1->offset = cpu_to_le64(rdata->mr->mr->iova); v1->token = cpu_to_le32(rdata->mr->mr->rkey); v1->length = cpu_to_le32(rdata->mr->mr->length); @@ -4546,7 +4565,11 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted); + goto do_retry; case MID_RETRY_NEEDED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed); +do_retry: __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); rdata->result = -EAGAIN; if (server->sign && rdata->got_bytes) @@ -4557,11 +4580,15 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed); credits.value = le16_to_cpu(shdr->CreditRequest); credits.instance = server->reconnect_instance; - fallthrough; + rdata->result = -EIO; + break; default: + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown); rdata->result = -EIO; + break; } #ifdef CONFIG_CIFS_SMB_DIRECT /* @@ -4814,11 +4841,14 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; result = smb2_check_receive(mid, server, 0); - if (result != 0) + if (result != 0) { + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_bad); break; + } written = le32_to_cpu(rsp->DataLength); /* @@ -4840,14 +4870,23 @@ smb2_writev_callback(struct mid_q_entry *mid) } break; case MID_REQUEST_SUBMITTED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); + result = -EAGAIN; + break; case MID_RETRY_NEEDED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed); + __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags); result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed); credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; - fallthrough; + result = -EIO; + break; default: + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown); result = -EIO; break; } @@ -4887,8 +4926,7 @@ smb2_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; - trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); - cifs_write_subrequest_terminated(wdata, result ?: written, true); + cifs_write_subrequest_terminated(wdata, result ?: written); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, server->credits, server->in_flight, @@ -4968,10 +5006,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a server RDMA read, fill in and append - * smbd_buffer_descriptor_v1 to the end of write request + * smbdirect_buffer_descriptor_v1 to the end of write request */ if (smb3_use_rdma_offload(io_parms)) { - struct smbd_buffer_descriptor_v1 *v1; + struct smbdirect_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter, @@ -4990,8 +5028,8 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) req->WriteChannelInfoOffset = cpu_to_le16(offsetof(struct smb2_write_req, Buffer)); req->WriteChannelInfoLength = - cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); - v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; + cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1)); + v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0]; v1->offset = cpu_to_le64(wdata->mr->mr->iova); v1->token = cpu_to_le32(wdata->mr->mr->rkey); v1->length = cpu_to_le32(wdata->mr->mr->length); @@ -5061,7 +5099,7 @@ out: -(int)wdata->credits.value, cifs_trace_rw_credits_write_response_clear); add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, true); + cifs_write_subrequest_terminated(wdata, rc); } } @@ -5917,71 +5955,6 @@ posix_qfsinf_exit: } int -SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) -{ - struct smb_rqst rqst; - struct smb2_query_info_rsp *rsp = NULL; - struct kvec iov; - struct kvec rsp_iov; - int rc = 0; - int resp_buftype; - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server; - struct smb2_fs_full_size_info *info = NULL; - int flags = 0; - int retries = 0, cur_sleep = 1; - -replay_again: - /* reinitialize for possible replay */ - flags = 0; - server = cifs_pick_channel(ses); - - rc = build_qfs_info_req(&iov, tcon, server, - FS_FULL_SIZE_INFORMATION, - sizeof(struct smb2_fs_full_size_info), - persistent_fid, volatile_fid); - if (rc) - return rc; - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = &iov; - rqst.rq_nvec = 1; - - if (retries) - smb2_set_replay(server, &rqst); - - rc = cifs_send_recv(xid, ses, server, - &rqst, &resp_buftype, flags, &rsp_iov); - free_qfs_info_req(&iov); - if (rc) { - cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); - goto qfsinf_exit; - } - rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - - info = (struct smb2_fs_full_size_info *)( - le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); - rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, - sizeof(struct smb2_fs_full_size_info)); - if (!rc) - smb2_copy_fs_info_to_kstatfs(info, fsdata); - -qfsinf_exit: - free_rsp_buf(resp_buftype, rsp_iov.iov_base); - - if (is_replayable_error(rc) && - smb2_should_replay(tcon, &retries, &cur_sleep)) - goto replay_again; - - return rc; -} - -int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int level) { diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 4662c7e2d259..6e805ece6a7b 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -54,7 +54,7 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data, +struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data, struct super_block *sb, const unsigned int xid, struct cifs_tcon *tcon, @@ -259,9 +259,6 @@ extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 volatile_fid); extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server); void smb2_cancelled_close_fid(struct work_struct *work); -extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_file_id, u64 volatile_file_id, - struct kstatfs *FSData); extern int SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); @@ -317,9 +314,6 @@ int smb311_posix_query_path_info(const unsigned int xid, int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); -int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, const char *symname); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b0b7254661e9..754e94a0e07f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/highmem.h> #include <linux/folio_queue.h> +#include "../common/smbdirect/smbdirect_pdu.h" #include "smbdirect.h" #include "cifs_debug.h" #include "cifsproto.h" @@ -50,9 +51,6 @@ struct smb_extract_to_rdma { static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, struct smb_extract_to_rdma *rdma); -/* SMBD version number */ -#define SMBD_V1 0x0100 - /* Port numbers for SMBD transport */ #define SMB_PORT 445 #define SMBD_PORT 5445 @@ -165,10 +163,11 @@ static void smbd_disconnect_rdma_work(struct work_struct *work) { struct smbd_connection *info = container_of(work, struct smbd_connection, disconnect_work); + struct smbdirect_socket *sc = &info->socket; - if (info->transport_status == SMBD_CONNECTED) { - info->transport_status = SMBD_DISCONNECTING; - rdma_disconnect(info->id); + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) { + sc->status = SMBDIRECT_SOCKET_DISCONNECTING; + rdma_disconnect(sc->rdma.cm_id); } } @@ -182,6 +181,7 @@ static int smbd_conn_upcall( struct rdma_cm_id *id, struct rdma_cm_event *event) { struct smbd_connection *info = id->context; + struct smbdirect_socket *sc = &info->socket; log_rdma_event(INFO, "event=%d status=%d\n", event->event, event->status); @@ -205,7 +205,7 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_ESTABLISHED: log_rdma_event(INFO, "connected event=%d\n", event->event); - info->transport_status = SMBD_CONNECTED; + sc->status = SMBDIRECT_SOCKET_CONNECTED; wake_up_interruptible(&info->conn_wait); break; @@ -213,20 +213,20 @@ static int smbd_conn_upcall( case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: log_rdma_event(INFO, "connecting failed event=%d\n", event->event); - info->transport_status = SMBD_DISCONNECTED; + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up_interruptible(&info->conn_wait); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_DISCONNECTED: /* This happens when we fail the negotiation */ - if (info->transport_status == SMBD_NEGOTIATE_FAILED) { - info->transport_status = SMBD_DISCONNECTED; + if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up(&info->conn_wait); break; } - info->transport_status = SMBD_DISCONNECTED; + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; wake_up_interruptible(&info->disconn_wait); wake_up_interruptible(&info->wait_reassembly_queue); wake_up_interruptible_all(&info->wait_send_queue); @@ -275,6 +275,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) int i; struct smbd_request *request = container_of(wc->wr_cqe, struct smbd_request, cqe); + struct smbd_connection *info = request->info; + struct smbdirect_socket *sc = &info->socket; log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n", request, wc->status); @@ -286,7 +288,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) } for (i = 0; i < request->num_sge; i++) - ib_dma_unmap_single(request->info->id->device, + ib_dma_unmap_single(sc->ib.dev, request->sge[i].addr, request->sge[i].length, DMA_TO_DEVICE); @@ -299,7 +301,7 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) mempool_free(request, request->info->request_mempool); } -static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp) +static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp) { log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n", resp->min_version, resp->max_version, @@ -318,15 +320,17 @@ static bool process_negotiation_response( struct smbd_response *response, int packet_length) { struct smbd_connection *info = response->info; - struct smbd_negotiate_resp *packet = smbd_response_payload(response); + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_negotiate_resp *packet = smbd_response_payload(response); - if (packet_length < sizeof(struct smbd_negotiate_resp)) { + if (packet_length < sizeof(struct smbdirect_negotiate_resp)) { log_rdma_event(ERR, "error: packet_length=%d\n", packet_length); return false; } - if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) { + if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) { log_rdma_event(ERR, "error: negotiated_version=%x\n", le16_to_cpu(packet->negotiated_version)); return false; @@ -347,20 +351,20 @@ static bool process_negotiation_response( atomic_set(&info->receive_credits, 0); - if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) { + if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { log_rdma_event(ERR, "error: preferred_send_size=%d\n", le32_to_cpu(packet->preferred_send_size)); return false; } - info->max_receive_size = le32_to_cpu(packet->preferred_send_size); + sp->max_recv_size = le32_to_cpu(packet->preferred_send_size); if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) { log_rdma_event(ERR, "error: max_receive_size=%d\n", le32_to_cpu(packet->max_receive_size)); return false; } - info->max_send_size = min_t(int, info->max_send_size, - le32_to_cpu(packet->max_receive_size)); + sp->max_send_size = min_t(u32, sp->max_send_size, + le32_to_cpu(packet->max_receive_size)); if (le32_to_cpu(packet->max_fragmented_size) < SMBD_MIN_FRAGMENTED_SIZE) { @@ -368,18 +372,18 @@ static bool process_negotiation_response( le32_to_cpu(packet->max_fragmented_size)); return false; } - info->max_fragmented_send_size = + sp->max_fragmented_send_size = le32_to_cpu(packet->max_fragmented_size); info->rdma_readwrite_threshold = - rdma_readwrite_threshold > info->max_fragmented_send_size ? - info->max_fragmented_send_size : + rdma_readwrite_threshold > sp->max_fragmented_send_size ? + sp->max_fragmented_send_size : rdma_readwrite_threshold; - info->max_readwrite_size = min_t(u32, + sp->max_read_write_size = min_t(u32, le32_to_cpu(packet->max_readwrite_size), info->max_frmr_depth * PAGE_SIZE); - info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE; + info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; return true; } @@ -393,8 +397,9 @@ static void smbd_post_send_credits(struct work_struct *work) struct smbd_connection *info = container_of(work, struct smbd_connection, post_send_credits_work); + struct smbdirect_socket *sc = &info->socket; - if (info->transport_status != SMBD_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { wake_up(&info->wait_receive_queues); return; } @@ -448,7 +453,7 @@ static void smbd_post_send_credits(struct work_struct *work) /* Called from softirq, when recv is done */ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct smbd_data_transfer *data_transfer; + struct smbdirect_data_transfer *data_transfer; struct smbd_response *response = container_of(wc->wr_cqe, struct smbd_response, cqe); struct smbd_connection *info = response->info; @@ -474,7 +479,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (response->type) { /* SMBD negotiation response */ case SMBD_NEGOTIATE_RESP: - dump_smbd_negotiate_resp(smbd_response_payload(response)); + dump_smbdirect_negotiate_resp(smbd_response_payload(response)); info->full_packet_received = true; info->negotiate_done = process_negotiation_response(response, wc->byte_len); @@ -531,7 +536,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) /* Send a KEEP_ALIVE response right away if requested */ info->keep_alive_requested = KEEP_ALIVE_NONE; if (le16_to_cpu(data_transfer->flags) & - SMB_DIRECT_RESPONSE_REQUESTED) { + SMBDIRECT_FLAG_RESPONSE_REQUESTED) { info->keep_alive_requested = KEEP_ALIVE_PENDING; } @@ -635,32 +640,34 @@ static int smbd_ia_open( struct smbd_connection *info, struct sockaddr *dstaddr, int port) { + struct smbdirect_socket *sc = &info->socket; int rc; - info->id = smbd_create_id(info, dstaddr, port); - if (IS_ERR(info->id)) { - rc = PTR_ERR(info->id); + sc->rdma.cm_id = smbd_create_id(info, dstaddr, port); + if (IS_ERR(sc->rdma.cm_id)) { + rc = PTR_ERR(sc->rdma.cm_id); goto out1; } + sc->ib.dev = sc->rdma.cm_id->device; - if (!frwr_is_supported(&info->id->device->attrs)) { + if (!frwr_is_supported(&sc->ib.dev->attrs)) { log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n"); log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n", - info->id->device->attrs.device_cap_flags, - info->id->device->attrs.max_fast_reg_page_list_len); + sc->ib.dev->attrs.device_cap_flags, + sc->ib.dev->attrs.max_fast_reg_page_list_len); rc = -EPROTONOSUPPORT; goto out2; } info->max_frmr_depth = min_t(int, smbd_max_frmr_depth, - info->id->device->attrs.max_fast_reg_page_list_len); + sc->ib.dev->attrs.max_fast_reg_page_list_len); info->mr_type = IB_MR_TYPE_MEM_REG; - if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) + if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) info->mr_type = IB_MR_TYPE_SG_GAPS; - info->pd = ib_alloc_pd(info->id->device, 0); - if (IS_ERR(info->pd)) { - rc = PTR_ERR(info->pd); + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { + rc = PTR_ERR(sc->ib.pd); log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); goto out2; } @@ -668,8 +675,8 @@ static int smbd_ia_open( return 0; out2: - rdma_destroy_id(info->id); - info->id = NULL; + rdma_destroy_id(sc->rdma.cm_id); + sc->rdma.cm_id = NULL; out1: return rc; @@ -683,10 +690,12 @@ out1: */ static int smbd_post_send_negotiate_req(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc = -ENOMEM; struct smbd_request *request; - struct smbd_negotiate_req *packet; + struct smbdirect_negotiate_req *packet; request = mempool_alloc(info->request_mempool, GFP_KERNEL); if (!request) @@ -695,29 +704,29 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) request->info = info; packet = smbd_request_payload(request); - packet->min_version = cpu_to_le16(SMBD_V1); - packet->max_version = cpu_to_le16(SMBD_V1); + packet->min_version = cpu_to_le16(SMBDIRECT_V1); + packet->max_version = cpu_to_le16(SMBDIRECT_V1); packet->reserved = 0; - packet->credits_requested = cpu_to_le16(info->send_credit_target); - packet->preferred_send_size = cpu_to_le32(info->max_send_size); - packet->max_receive_size = cpu_to_le32(info->max_receive_size); + packet->credits_requested = cpu_to_le16(sp->send_credit_target); + packet->preferred_send_size = cpu_to_le32(sp->max_send_size); + packet->max_receive_size = cpu_to_le32(sp->max_recv_size); packet->max_fragmented_size = - cpu_to_le32(info->max_fragmented_recv_size); + cpu_to_le32(sp->max_fragmented_recv_size); request->num_sge = 1; request->sge[0].addr = ib_dma_map_single( - info->id->device, (void *)packet, + sc->ib.dev, (void *)packet, sizeof(*packet), DMA_TO_DEVICE); - if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { rc = -EIO; goto dma_mapping_failed; } request->sge[0].length = sizeof(*packet); - request->sge[0].lkey = info->pd->local_dma_lkey; + request->sge[0].lkey = sc->ib.pd->local_dma_lkey; ib_dma_sync_single_for_device( - info->id->device, request->sge[0].addr, + sc->ib.dev, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); request->cqe.done = send_done; @@ -734,14 +743,14 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info) request->sge[0].length, request->sge[0].lkey); atomic_inc(&info->send_pending); - rc = ib_post_send(info->id->qp, &send_wr, NULL); + rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (!rc) return 0; /* if we reach here, post send failed */ log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); atomic_dec(&info->send_pending); - ib_dma_unmap_single(info->id->device, request->sge[0].addr, + ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr, request->sge[0].length, DMA_TO_DEVICE); smbd_disconnect_rdma_connection(info); @@ -774,10 +783,10 @@ static int manage_credits_prior_sending(struct smbd_connection *info) /* * Check if we need to send a KEEP_ALIVE message * The idle connection timer triggers a KEEP_ALIVE message when expires - * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send + * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flag to have peer send * back a response. * return value: - * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set + * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set * 0: otherwise */ static int manage_keep_alive_before_sending(struct smbd_connection *info) @@ -793,6 +802,8 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info) static int smbd_post_send(struct smbd_connection *info, struct smbd_request *request) { + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_send_wr send_wr; int rc, i; @@ -801,7 +812,7 @@ static int smbd_post_send(struct smbd_connection *info, "rdma_request sge[%d] addr=0x%llx length=%u\n", i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( - info->id->device, + sc->ib.dev, request->sge[i].addr, request->sge[i].length, DMA_TO_DEVICE); @@ -816,7 +827,7 @@ static int smbd_post_send(struct smbd_connection *info, send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - rc = ib_post_send(info->id->qp, &send_wr, NULL); + rc = ib_post_send(sc->ib.qp, &send_wr, NULL); if (rc) { log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); smbd_disconnect_rdma_connection(info); @@ -824,7 +835,7 @@ static int smbd_post_send(struct smbd_connection *info, } else /* Reset timer for idle connection after packet is sent */ mod_delayed_work(info->workqueue, &info->idle_timer_work, - info->keep_alive_interval*HZ); + msecs_to_jiffies(sp->keepalive_interval_msec)); return rc; } @@ -833,22 +844,24 @@ static int smbd_post_send_iter(struct smbd_connection *info, struct iov_iter *iter, int *_remaining_data_length) { + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; int i, rc; int header_length; int data_length; struct smbd_request *request; - struct smbd_data_transfer *packet; + struct smbdirect_data_transfer *packet; int new_credits = 0; wait_credit: /* Wait for send credits. A SMBD packet needs one credit */ rc = wait_event_interruptible(info->wait_send_queue, atomic_read(&info->send_credits) > 0 || - info->transport_status != SMBD_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) goto err_wait_credit; - if (info->transport_status != SMBD_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { log_outgoing(ERR, "disconnected not sending on wait_credit\n"); rc = -EAGAIN; goto err_wait_credit; @@ -860,17 +873,17 @@ wait_credit: wait_send_queue: wait_event(info->wait_post_send, - atomic_read(&info->send_pending) < info->send_credit_target || - info->transport_status != SMBD_CONNECTED); + atomic_read(&info->send_pending) < sp->send_credit_target || + sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (info->transport_status != SMBD_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { log_outgoing(ERR, "disconnected not sending on wait_send_queue\n"); rc = -EAGAIN; goto err_wait_send_queue; } if (unlikely(atomic_inc_return(&info->send_pending) > - info->send_credit_target)) { + sp->send_credit_target)) { atomic_dec(&info->send_pending); goto wait_send_queue; } @@ -890,12 +903,14 @@ wait_send_queue: .nr_sge = 1, .max_sge = SMBDIRECT_MAX_SEND_SGE, .sge = request->sge, - .device = info->id->device, - .local_dma_lkey = info->pd->local_dma_lkey, + .device = sc->ib.dev, + .local_dma_lkey = sc->ib.pd->local_dma_lkey, .direction = DMA_TO_DEVICE, }; + size_t payload_len = umin(*_remaining_data_length, + sp->max_send_size - sizeof(*packet)); - rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length, + rc = smb_extract_iter_to_rdma(iter, payload_len, &extract); if (rc < 0) goto err_dma; @@ -909,7 +924,7 @@ wait_send_queue: /* Fill in the packet header */ packet = smbd_request_payload(request); - packet->credits_requested = cpu_to_le16(info->send_credit_target); + packet->credits_requested = cpu_to_le16(sp->send_credit_target); new_credits = manage_credits_prior_sending(info); atomic_add(new_credits, &info->receive_credits); @@ -919,7 +934,7 @@ wait_send_queue: packet->flags = 0; if (manage_keep_alive_before_sending(info)) - packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED); + packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); packet->reserved = 0; if (!data_length) @@ -938,23 +953,23 @@ wait_send_queue: le32_to_cpu(packet->remaining_data_length)); /* Map the packet to DMA */ - header_length = sizeof(struct smbd_data_transfer); + header_length = sizeof(struct smbdirect_data_transfer); /* If this is a packet without payload, don't send padding */ if (!data_length) - header_length = offsetof(struct smbd_data_transfer, padding); + header_length = offsetof(struct smbdirect_data_transfer, padding); - request->sge[0].addr = ib_dma_map_single(info->id->device, + request->sge[0].addr = ib_dma_map_single(sc->ib.dev, (void *)packet, header_length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { rc = -EIO; request->sge[0].addr = 0; goto err_dma; } request->sge[0].length = header_length; - request->sge[0].lkey = info->pd->local_dma_lkey; + request->sge[0].lkey = sc->ib.pd->local_dma_lkey; rc = smbd_post_send(info, request); if (!rc) @@ -963,7 +978,7 @@ wait_send_queue: err_dma: for (i = 0; i < request->num_sge; i++) if (request->sge[i].addr) - ib_dma_unmap_single(info->id->device, + ib_dma_unmap_single(sc->ib.dev, request->sge[i].addr, request->sge[i].length, DMA_TO_DEVICE); @@ -1000,6 +1015,27 @@ static int smbd_post_send_empty(struct smbd_connection *info) return smbd_post_send_iter(info, NULL, &remaining_data_length); } +static int smbd_post_send_full_iter(struct smbd_connection *info, + struct iov_iter *iter, + int *_remaining_data_length) +{ + int rc = 0; + + /* + * smbd_post_send_iter() respects the + * negotiated max_send_size, so we need to + * loop until the full iter is posted + */ + + while (iov_iter_count(iter) > 0) { + rc = smbd_post_send_iter(info, iter, _remaining_data_length); + if (rc < 0) + break; + } + + return rc; +} + /* * Post a receive request to the transport * The remote peer can only send data when a receive request is posted @@ -1008,17 +1044,19 @@ static int smbd_post_send_empty(struct smbd_connection *info) static int smbd_post_recv( struct smbd_connection *info, struct smbd_response *response) { + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct ib_recv_wr recv_wr; int rc = -EIO; response->sge.addr = ib_dma_map_single( - info->id->device, response->packet, - info->max_receive_size, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(info->id->device, response->sge.addr)) + sc->ib.dev, response->packet, + sp->max_recv_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr)) return rc; - response->sge.length = info->max_receive_size; - response->sge.lkey = info->pd->local_dma_lkey; + response->sge.length = sp->max_recv_size; + response->sge.lkey = sc->ib.pd->local_dma_lkey; response->cqe.done = recv_done; @@ -1027,9 +1065,9 @@ static int smbd_post_recv( recv_wr.sg_list = &response->sge; recv_wr.num_sge = 1; - rc = ib_post_recv(info->id->qp, &recv_wr, NULL); + rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL); if (rc) { - ib_dma_unmap_single(info->id->device, response->sge.addr, + ib_dma_unmap_single(sc->ib.dev, response->sge.addr, response->sge.length, DMA_FROM_DEVICE); smbd_disconnect_rdma_connection(info); log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); @@ -1187,9 +1225,10 @@ static struct smbd_response *get_receive_buffer(struct smbd_connection *info) static void put_receive_buffer( struct smbd_connection *info, struct smbd_response *response) { + struct smbdirect_socket *sc = &info->socket; unsigned long flags; - ib_dma_unmap_single(info->id->device, response->sge.addr, + ib_dma_unmap_single(sc->ib.dev, response->sge.addr, response->sge.length, DMA_FROM_DEVICE); spin_lock_irqsave(&info->receive_queue_lock, flags); @@ -1264,6 +1303,8 @@ static void idle_connection_timer(struct work_struct *work) struct smbd_connection *info = container_of( work, struct smbd_connection, idle_timer_work.work); + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; if (info->keep_alive_requested != KEEP_ALIVE_NONE) { log_keep_alive(ERR, @@ -1278,7 +1319,7 @@ static void idle_connection_timer(struct work_struct *work) /* Setup the next idle timeout work */ queue_delayed_work(info->workqueue, &info->idle_timer_work, - info->keep_alive_interval*HZ); + msecs_to_jiffies(sp->keepalive_interval_msec)); } /* @@ -1289,6 +1330,8 @@ static void idle_connection_timer(struct work_struct *work) void smbd_destroy(struct TCP_Server_Info *server) { struct smbd_connection *info = server->smbd_conn; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct smbd_response *response; unsigned long flags; @@ -1296,19 +1339,22 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "rdma session already destroyed\n"); return; } + sc = &info->socket; + sp = &sc->parameters; log_rdma_event(INFO, "destroying rdma session\n"); - if (info->transport_status != SMBD_DISCONNECTED) { - rdma_disconnect(server->smbd_conn->id); + if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) { + rdma_disconnect(sc->rdma.cm_id); log_rdma_event(INFO, "wait for transport being disconnected\n"); wait_event_interruptible( info->disconn_wait, - info->transport_status == SMBD_DISCONNECTED); + sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } log_rdma_event(INFO, "destroying qp\n"); - ib_drain_qp(info->id->qp); - rdma_destroy_qp(info->id); + ib_drain_qp(sc->ib.qp); + rdma_destroy_qp(sc->rdma.cm_id); + sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); cancel_delayed_work_sync(&info->idle_timer_work); @@ -1336,7 +1382,7 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "free receive buffers\n"); wait_event(info->wait_receive_queues, info->count_receive_queue + info->count_empty_packet_queue - == info->receive_credit_max); + == sp->recv_credit_max); destroy_receive_buffers(info); /* @@ -1355,10 +1401,10 @@ void smbd_destroy(struct TCP_Server_Info *server) } destroy_mr_list(info); - ib_free_cq(info->send_cq); - ib_free_cq(info->recv_cq); - ib_dealloc_pd(info->pd); - rdma_destroy_id(info->id); + ib_free_cq(sc->ib.send_cq); + ib_free_cq(sc->ib.recv_cq); + ib_dealloc_pd(sc->ib.pd); + rdma_destroy_id(sc->rdma.cm_id); /* free mempools */ mempool_destroy(info->request_mempool); @@ -1367,7 +1413,7 @@ void smbd_destroy(struct TCP_Server_Info *server) mempool_destroy(info->response_mempool); kmem_cache_destroy(info->response_cache); - info->transport_status = SMBD_DESTROYED; + sc->status = SMBDIRECT_SOCKET_DESTROYED; destroy_workqueue(info->workqueue); log_rdma_event(INFO, "rdma session destroyed\n"); @@ -1392,7 +1438,7 @@ int smbd_reconnect(struct TCP_Server_Info *server) * This is possible if transport is disconnected and we haven't received * notification from RDMA, but upper layer has detected timeout */ - if (server->smbd_conn->transport_status == SMBD_CONNECTED) { + if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) { log_rdma_event(INFO, "disconnecting transport\n"); smbd_destroy(server); } @@ -1424,37 +1470,47 @@ static void destroy_caches_and_workqueue(struct smbd_connection *info) #define MAX_NAME_LEN 80 static int allocate_caches_and_workqueue(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; char name[MAX_NAME_LEN]; int rc; + if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) + return -ENOMEM; + scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); info->request_cache = kmem_cache_create( name, sizeof(struct smbd_request) + - sizeof(struct smbd_data_transfer), + sizeof(struct smbdirect_data_transfer), 0, SLAB_HWCACHE_ALIGN, NULL); if (!info->request_cache) return -ENOMEM; info->request_mempool = - mempool_create(info->send_credit_target, mempool_alloc_slab, + mempool_create(sp->send_credit_target, mempool_alloc_slab, mempool_free_slab, info->request_cache); if (!info->request_mempool) goto out1; scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + + struct kmem_cache_args response_args = { + .align = __alignof__(struct smbd_response), + .useroffset = (offsetof(struct smbd_response, packet) + + sizeof(struct smbdirect_data_transfer)), + .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer), + }; info->response_cache = - kmem_cache_create( - name, - sizeof(struct smbd_response) + - info->max_receive_size, - 0, SLAB_HWCACHE_ALIGN, NULL); + kmem_cache_create(name, + sizeof(struct smbd_response) + sp->max_recv_size, + &response_args, SLAB_HWCACHE_ALIGN); if (!info->response_cache) goto out2; info->response_mempool = - mempool_create(info->receive_credit_max, mempool_alloc_slab, + mempool_create(sp->recv_credit_max, mempool_alloc_slab, mempool_free_slab, info->response_cache); if (!info->response_mempool) goto out3; @@ -1464,7 +1520,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!info->workqueue) goto out4; - rc = allocate_receive_buffers(info, info->receive_credit_max); + rc = allocate_receive_buffers(info, sp->recv_credit_max); if (rc) { log_rdma_event(ERR, "failed to allocate receive buffers\n"); goto out5; @@ -1491,6 +1547,8 @@ static struct smbd_connection *_smbd_get_connection( { int rc; struct smbd_connection *info; + struct smbdirect_socket *sc; + struct smbdirect_socket_parameters *sp; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; @@ -1500,101 +1558,102 @@ static struct smbd_connection *_smbd_get_connection( info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); if (!info) return NULL; + sc = &info->socket; + sp = &sc->parameters; - info->transport_status = SMBD_CONNECTING; + sc->status = SMBDIRECT_SOCKET_CONNECTING; rc = smbd_ia_open(info, dstaddr, port); if (rc) { log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); goto create_id_failed; } - if (smbd_send_credit_target > info->id->device->attrs.max_cqe || - smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { + if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe || + smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_send_credit_target, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || - smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { + if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe || + smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) { log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", smbd_receive_credit_max, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); goto config_failed; } - info->receive_credit_max = smbd_receive_credit_max; - info->send_credit_target = smbd_send_credit_target; - info->max_send_size = smbd_max_send_size; - info->max_fragmented_recv_size = smbd_max_fragmented_recv_size; - info->max_receive_size = smbd_max_receive_size; - info->keep_alive_interval = smbd_keep_alive_interval; + sp->recv_credit_max = smbd_receive_credit_max; + sp->send_credit_target = smbd_send_credit_target; + sp->max_send_size = smbd_max_send_size; + sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size; + sp->max_recv_size = smbd_max_receive_size; + sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; - if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE || - info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) { + if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE || + sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) { log_rdma_event(ERR, "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", IB_DEVICE_NAME_MAX, - info->id->device->name, - info->id->device->attrs.max_send_sge, - info->id->device->attrs.max_recv_sge); + sc->ib.dev->name, + sc->ib.dev->attrs.max_send_sge, + sc->ib.dev->attrs.max_recv_sge); goto config_failed; } - info->send_cq = NULL; - info->recv_cq = NULL; - info->send_cq = - ib_alloc_cq_any(info->id->device, info, - info->send_credit_target, IB_POLL_SOFTIRQ); - if (IS_ERR(info->send_cq)) { - info->send_cq = NULL; + sc->ib.send_cq = + ib_alloc_cq_any(sc->ib.dev, info, + sp->send_credit_target, IB_POLL_SOFTIRQ); + if (IS_ERR(sc->ib.send_cq)) { + sc->ib.send_cq = NULL; goto alloc_cq_failed; } - info->recv_cq = - ib_alloc_cq_any(info->id->device, info, - info->receive_credit_max, IB_POLL_SOFTIRQ); - if (IS_ERR(info->recv_cq)) { - info->recv_cq = NULL; + sc->ib.recv_cq = + ib_alloc_cq_any(sc->ib.dev, info, + sp->recv_credit_max, IB_POLL_SOFTIRQ); + if (IS_ERR(sc->ib.recv_cq)) { + sc->ib.recv_cq = NULL; goto alloc_cq_failed; } memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.event_handler = smbd_qp_async_error_upcall; qp_attr.qp_context = info; - qp_attr.cap.max_send_wr = info->send_credit_target; - qp_attr.cap.max_recv_wr = info->receive_credit_max; + qp_attr.cap.max_send_wr = sp->send_credit_target; + qp_attr.cap.max_recv_wr = sp->recv_credit_max; qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE; qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE; qp_attr.cap.max_inline_data = 0; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; - qp_attr.send_cq = info->send_cq; - qp_attr.recv_cq = info->recv_cq; + qp_attr.send_cq = sc->ib.send_cq; + qp_attr.recv_cq = sc->ib.recv_cq; qp_attr.port_num = ~0; - rc = rdma_create_qp(info->id, info->pd, &qp_attr); + rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); if (rc) { log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc); goto create_qp_failed; } + sc->ib.qp = sc->rdma.cm_id->qp; memset(&conn_param, 0, sizeof(conn_param)); conn_param.initiator_depth = 0; conn_param.responder_resources = - min(info->id->device->attrs.max_qp_rd_atom, + min(sc->ib.dev->attrs.max_qp_rd_atom, SMBD_CM_RESPONDER_RESOURCES); info->responder_resources = conn_param.responder_resources; log_rdma_mr(INFO, "responder_resources=%d\n", info->responder_resources); /* Need to send IRD/ORD in private data for iWARP */ - info->id->device->ops.get_port_immutable( - info->id->device, info->id->port_num, &port_immutable); + sc->ib.dev->ops.get_port_immutable( + sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { ird_ord_hdr[0] = info->responder_resources; ird_ord_hdr[1] = 1; @@ -1615,16 +1674,16 @@ static struct smbd_connection *_smbd_get_connection( init_waitqueue_head(&info->conn_wait); init_waitqueue_head(&info->disconn_wait); init_waitqueue_head(&info->wait_reassembly_queue); - rc = rdma_connect(info->id, &conn_param); + rc = rdma_connect(sc->rdma.cm_id, &conn_param); if (rc) { log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); goto rdma_connect_failed; } wait_event_interruptible( - info->conn_wait, info->transport_status != SMBD_CONNECTING); + info->conn_wait, sc->status != SMBDIRECT_SOCKET_CONNECTING); - if (info->transport_status != SMBD_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); goto rdma_connect_failed; } @@ -1640,7 +1699,7 @@ static struct smbd_connection *_smbd_get_connection( init_waitqueue_head(&info->wait_send_queue); INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); queue_delayed_work(info->workqueue, &info->idle_timer_work, - info->keep_alive_interval*HZ); + msecs_to_jiffies(sp->keepalive_interval_msec)); init_waitqueue_head(&info->wait_send_pending); atomic_set(&info->send_pending, 0); @@ -1675,26 +1734,26 @@ allocate_mr_failed: negotiation_failed: cancel_delayed_work_sync(&info->idle_timer_work); destroy_caches_and_workqueue(info); - info->transport_status = SMBD_NEGOTIATE_FAILED; + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; init_waitqueue_head(&info->conn_wait); - rdma_disconnect(info->id); + rdma_disconnect(sc->rdma.cm_id); wait_event(info->conn_wait, - info->transport_status == SMBD_DISCONNECTED); + sc->status == SMBDIRECT_SOCKET_DISCONNECTED); allocate_cache_failed: rdma_connect_failed: - rdma_destroy_qp(info->id); + rdma_destroy_qp(sc->rdma.cm_id); create_qp_failed: alloc_cq_failed: - if (info->send_cq) - ib_free_cq(info->send_cq); - if (info->recv_cq) - ib_free_cq(info->recv_cq); + if (sc->ib.send_cq) + ib_free_cq(sc->ib.send_cq); + if (sc->ib.recv_cq) + ib_free_cq(sc->ib.recv_cq); config_failed: - ib_dealloc_pd(info->pd); - rdma_destroy_id(info->id); + ib_dealloc_pd(sc->ib.pd); + rdma_destroy_id(sc->rdma.cm_id); create_id_failed: kfree(info); @@ -1719,34 +1778,39 @@ try_again: } /* - * Receive data from receive reassembly queue + * Receive data from the transport's receive reassembly queue * All the incoming data packets are placed in reassembly queue - * buf: the buffer to read data into + * iter: the buffer to read data into * size: the length of data to read * return value: actual data read - * Note: this implementation copies the data from reassebmly queue to receive + * + * Note: this implementation copies the data from reassembly queue to receive * buffers used by upper layer. This is not the optimal code path. A better way * to do it is to not have upper layer allocate its receive buffers but rather * borrow the buffer from reassembly queue, and return it after data is * consumed. But this will require more changes to upper layer code, and also * need to consider packet boundaries while they still being reassembled. */ -static int smbd_recv_buf(struct smbd_connection *info, char *buf, - unsigned int size) +int smbd_recv(struct smbd_connection *info, struct msghdr *msg) { + struct smbdirect_socket *sc = &info->socket; struct smbd_response *response; - struct smbd_data_transfer *data_transfer; + struct smbdirect_data_transfer *data_transfer; + size_t size = iov_iter_count(&msg->msg_iter); int to_copy, to_read, data_read, offset; u32 data_length, remaining_data_length, data_offset; int rc; + if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE)) + return -EINVAL; /* It's a bug in upper layer to get there */ + again: /* * No need to hold the reassembly queue lock all the time as we are * the only one reading from the front of the queue. The transport * may add more entries to the back of the queue at the same time */ - log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size, + log_read(INFO, "size=%zd info->reassembly_data_length=%d\n", size, info->reassembly_data_length); if (info->reassembly_data_length >= size) { int queue_length; @@ -1784,7 +1848,10 @@ again: if (response->first_segment && size == 4) { unsigned int rfc1002_len = data_length + remaining_data_length; - *((__be32 *)buf) = cpu_to_be32(rfc1002_len); + __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len); + if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr), + &msg->msg_iter) != sizeof(rfc1002_hdr)) + return -EFAULT; data_read = 4; response->first_segment = false; log_read(INFO, "returning rfc1002 length %d\n", @@ -1793,10 +1860,9 @@ again: } to_copy = min_t(int, data_length - offset, to_read); - memcpy( - buf + data_read, - (char *)data_transfer + data_offset + offset, - to_copy); + if (copy_to_iter((char *)data_transfer + data_offset + offset, + to_copy, &msg->msg_iter) != to_copy) + return -EFAULT; /* move on to the next buffer? */ if (to_copy == data_length - offset) { @@ -1848,12 +1914,12 @@ read_rfc1002_done: rc = wait_event_interruptible( info->wait_reassembly_queue, info->reassembly_data_length >= size || - info->transport_status != SMBD_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); /* Don't return any data if interrupted */ if (rc) return rc; - if (info->transport_status != SMBD_CONNECTED) { + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { log_read(ERR, "disconnected\n"); return -ECONNABORTED; } @@ -1862,89 +1928,6 @@ read_rfc1002_done: } /* - * Receive a page from receive reassembly queue - * page: the page to read data into - * to_read: the length of data to read - * return value: actual data read - */ -static int smbd_recv_page(struct smbd_connection *info, - struct page *page, unsigned int page_offset, - unsigned int to_read) -{ - int ret; - char *to_address; - void *page_address; - - /* make sure we have the page ready for read */ - ret = wait_event_interruptible( - info->wait_reassembly_queue, - info->reassembly_data_length >= to_read || - info->transport_status != SMBD_CONNECTED); - if (ret) - return ret; - - /* now we can read from reassembly queue and not sleep */ - page_address = kmap_atomic(page); - to_address = (char *) page_address + page_offset; - - log_read(INFO, "reading from page=%p address=%p to_read=%d\n", - page, to_address, to_read); - - ret = smbd_recv_buf(info, to_address, to_read); - kunmap_atomic(page_address); - - return ret; -} - -/* - * Receive data from transport - * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC - * return: total bytes read, or 0. SMB Direct will not do partial read. - */ -int smbd_recv(struct smbd_connection *info, struct msghdr *msg) -{ - char *buf; - struct page *page; - unsigned int to_read, page_offset; - int rc; - - if (iov_iter_rw(&msg->msg_iter) == WRITE) { - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "Invalid msg iter dir %u\n", - iov_iter_rw(&msg->msg_iter)); - rc = -EINVAL; - goto out; - } - - switch (iov_iter_type(&msg->msg_iter)) { - case ITER_KVEC: - buf = msg->msg_iter.kvec->iov_base; - to_read = msg->msg_iter.kvec->iov_len; - rc = smbd_recv_buf(info, buf, to_read); - break; - - case ITER_BVEC: - page = msg->msg_iter.bvec->bv_page; - page_offset = msg->msg_iter.bvec->bv_offset; - to_read = msg->msg_iter.bvec->bv_len; - rc = smbd_recv_page(info, page, page_offset, to_read); - break; - - default: - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "Invalid msg type %d\n", - iov_iter_type(&msg->msg_iter)); - rc = -EINVAL; - } - -out: - /* SMBDirect will read it all or nothing */ - if (rc > 0) - msg->msg_iter.count = 0; - return rc; -} - -/* * Send data to transport * Each rqst is transported as a SMBDirect payload * rqst: the data to write @@ -1954,12 +1937,14 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; + struct smbdirect_socket *sc = &info->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smb_rqst *rqst; struct iov_iter iter; unsigned int remaining_data_length, klen; int rc, i, rqst_idx; - if (info->transport_status != SMBD_CONNECTED) + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) return -EAGAIN; /* @@ -1971,10 +1956,10 @@ int smbd_send(struct TCP_Server_Info *server, for (i = 0; i < num_rqst; i++) remaining_data_length += smb_rqst_len(server, &rqst_array[i]); - if (unlikely(remaining_data_length > info->max_fragmented_send_size)) { + if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) { /* assertion: payload never exceeds negotiated maximum */ log_write(ERR, "payload size %d > max size %d\n", - remaining_data_length, info->max_fragmented_send_size); + remaining_data_length, sp->max_fragmented_send_size); return -EINVAL; } @@ -2000,14 +1985,14 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_iter(info, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length); if (rc < 0) break; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_iter(info, &rqst->rq_iter, - &remaining_data_length); + rc = smbd_post_send_full_iter(info, &rqst->rq_iter, + &remaining_data_length); if (rc < 0) break; } @@ -2053,6 +2038,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) { struct smbd_connection *info = container_of(work, struct smbd_connection, mr_recovery_work); + struct smbdirect_socket *sc = &info->socket; struct smbd_mr *smbdirect_mr; int rc; @@ -2070,7 +2056,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) } smbdirect_mr->mr = ib_alloc_mr( - info->pd, info->mr_type, + sc->ib.pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", @@ -2099,12 +2085,13 @@ static void smbd_mr_recovery_work(struct work_struct *work) static void destroy_mr_list(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; struct smbd_mr *mr, *tmp; cancel_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) - ib_dma_unmap_sg(info->id->device, mr->sgt.sgl, + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); kfree(mr->sgt.sgl); @@ -2121,6 +2108,7 @@ static void destroy_mr_list(struct smbd_connection *info) */ static int allocate_mr_list(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; int i; struct smbd_mr *smbdirect_mr, *tmp; @@ -2136,7 +2124,7 @@ static int allocate_mr_list(struct smbd_connection *info) smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); if (!smbdirect_mr) goto cleanup_entries; - smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, + smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type, info->max_frmr_depth); if (IS_ERR(smbdirect_mr->mr)) { log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", @@ -2181,20 +2169,20 @@ cleanup_entries: */ static struct smbd_mr *get_mr(struct smbd_connection *info) { + struct smbdirect_socket *sc = &info->socket; struct smbd_mr *ret; int rc; again: rc = wait_event_interruptible(info->wait_mr, atomic_read(&info->mr_ready_count) || - info->transport_status != SMBD_CONNECTED); + sc->status != SMBDIRECT_SOCKET_CONNECTED); if (rc) { log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); return NULL; } - if (info->transport_status != SMBD_CONNECTED) { - log_rdma_mr(ERR, "info->transport_status=%x\n", - info->transport_status); + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + log_rdma_mr(ERR, "sc->status=%x\n", sc->status); return NULL; } @@ -2247,6 +2235,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate) { + struct smbdirect_socket *sc = &info->socket; struct smbd_mr *smbdirect_mr; int rc, num_pages; enum dma_data_direction dir; @@ -2276,7 +2265,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, num_pages, iov_iter_count(iter), info->max_frmr_depth); smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); - rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl, + rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, dir); if (!rc) { log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", @@ -2312,7 +2301,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution * on the next ib_post_send when we actually send I/O to remote peer */ - rc = ib_post_send(info->id->qp, ®_wr->wr, NULL); + rc = ib_post_send(sc->ib.qp, ®_wr->wr, NULL); if (!rc) return smbdirect_mr; @@ -2321,7 +2310,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ map_mr_error: - ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl, + ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: @@ -2359,6 +2348,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) { struct ib_send_wr *wr; struct smbd_connection *info = smbdirect_mr->conn; + struct smbdirect_socket *sc = &info->socket; int rc = 0; if (smbdirect_mr->need_invalidate) { @@ -2372,7 +2362,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) wr->send_flags = IB_SEND_SIGNALED; init_completion(&smbdirect_mr->invalidate_done); - rc = ib_post_send(info->id->qp, wr, NULL); + rc = ib_post_send(sc->ib.qp, wr, NULL); if (rc) { log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); smbd_disconnect_rdma_connection(info); @@ -2389,7 +2379,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) if (smbdirect_mr->state == MR_INVALIDATED) { ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgt.sgl, + sc->ib.dev, smbdirect_mr->sgt.sgl, smbdirect_mr->sgt.nents, smbdirect_mr->dir); smbdirect_mr->state = MR_READY; @@ -2552,13 +2542,14 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, size_t fsize = folioq_folio_size(folioq, slot); if (offset < fsize) { - size_t part = umin(maxsize - ret, fsize - offset); + size_t part = umin(maxsize, fsize - offset); if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) return -EIO; offset += part; ret += part; + maxsize -= part; } if (offset >= fsize) { @@ -2573,7 +2564,7 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, slot = 0; } } - } while (rdma->nr_sge < rdma->max_sge || maxsize > 0); + } while (rdma->nr_sge < rdma->max_sge && maxsize > 0); iter->folioq = folioq; iter->folioq_slot = slot; diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index c08e3665150d..75b3f491c3ad 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -15,6 +15,9 @@ #include <rdma/rdma_cm.h> #include <linux/mempool.h> +#include "../common/smbdirect/smbdirect.h" +#include "../common/smbdirect/smbdirect_socket.h" + extern int rdma_readwrite_threshold; extern int smbd_max_frmr_depth; extern int smbd_keep_alive_interval; @@ -50,14 +53,8 @@ enum smbd_connection_status { * 5. mempools for allocating packets */ struct smbd_connection { - enum smbd_connection_status transport_status; - - /* RDMA related */ - struct rdma_cm_id *id; - struct ib_qp_init_attr qp_attr; - struct ib_pd *pd; - struct ib_cq *send_cq, *recv_cq; - struct ib_device_attr dev_attr; + struct smbdirect_socket socket; + int ri_rc; struct completion ri_done; wait_queue_head_t conn_wait; @@ -72,15 +69,7 @@ struct smbd_connection { spinlock_t lock_new_credits_offered; int new_credits_offered; - /* Connection parameters defined in [MS-SMBD] 3.1.1.1 */ - int receive_credit_max; - int send_credit_target; - int max_send_size; - int max_fragmented_recv_size; - int max_fragmented_send_size; - int max_receive_size; - int keep_alive_interval; - int max_readwrite_size; + /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */ enum keep_alive_status keep_alive_requested; int protocol; atomic_t send_credits; @@ -177,54 +166,6 @@ enum smbd_message_type { SMBD_TRANSFER_DATA, }; -#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 - -/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */ -struct smbd_negotiate_req { - __le16 min_version; - __le16 max_version; - __le16 reserved; - __le16 credits_requested; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */ -struct smbd_negotiate_resp { - __le16 min_version; - __le16 max_version; - __le16 negotiated_version; - __le16 reserved; - __le16 credits_requested; - __le16 credits_granted; - __le32 status; - __le32 max_readwrite_size; - __le32 preferred_send_size; - __le32 max_receive_size; - __le32 max_fragmented_size; -} __packed; - -/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */ -struct smbd_data_transfer { - __le16 credits_requested; - __le16 credits_granted; - __le16 flags; - __le16 reserved; - __le32 remaining_data_length; - __le32 data_offset; - __le32 data_length; - __le32 padding; - __u8 buffer[]; -} __packed; - -/* The packet fields for a registered RDMA buffer */ -struct smbd_buffer_descriptor_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - /* Maximum number of SGEs used by smbdirect.c in any send work request */ #define SMBDIRECT_MAX_SEND_SGE 6 diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 52bcb55d9952..93e5b2bb9f28 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -140,7 +140,7 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", __entry->rreq_debug_id, __entry->rreq_debug_index, __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->offset, __entry->len, __entry->rc) @@ -190,7 +190,7 @@ DECLARE_EVENT_CLASS(smb3_other_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->offset, __entry->len, __entry->rc) ) @@ -247,7 +247,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_err_class, __entry->len = len; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc) ) @@ -298,7 +298,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_done_class, __entry->target_offset = target_offset; __entry->len = len; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", + TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len) ) @@ -482,7 +482,7 @@ DECLARE_EVENT_CLASS(smb3_fd_class, __entry->tid = tid; __entry->sesid = sesid; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx", __entry->xid, __entry->sesid, __entry->tid, __entry->fid) ) @@ -521,7 +521,7 @@ DECLARE_EVENT_CLASS(smb3_fd_err_class, __entry->sesid = sesid; __entry->rc = rc; ), - TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d", __entry->xid, __entry->sesid, __entry->tid, __entry->fid, __entry->rc) ) @@ -794,7 +794,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_err_class, __entry->status = status; __entry->rc = rc; ), - TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", + TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d", __entry->sesid, __entry->tid, __entry->cmd, __entry->mid, __entry->status, __entry->rc) ) @@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_done_class, __entry->cmd = cmd; __entry->mid = mid; ), - TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu", + TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu", __entry->sesid, __entry->tid, __entry->cmd, __entry->mid) ) @@ -867,7 +867,7 @@ DECLARE_EVENT_CLASS(smb3_mid_class, __entry->when_sent = when_sent; __entry->when_received = when_received; ), - TP_printk("\tcmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu", + TP_printk("cmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu", __entry->cmd, __entry->mid, __entry->pid, __entry->when_sent, __entry->when_received) ) @@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, __assign_str(func_name); __entry->rc = rc; ), - TP_printk("\t%s: xid=%u rc=%d", + TP_printk("%s: xid=%u rc=%d", __get_str(func_name), __entry->xid, __entry->rc) ) @@ -924,7 +924,7 @@ DECLARE_EVENT_CLASS(smb3_sync_err_class, __entry->ino = ino; __entry->rc = rc; ), - TP_printk("\tino=%lu rc=%d", + TP_printk("ino=%lu rc=%d", __entry->ino, __entry->rc) ) @@ -950,7 +950,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, __entry->xid = xid; __assign_str(func_name); ), - TP_printk("\t%s: xid=%u", + TP_printk("%s: xid=%u", __get_str(func_name), __entry->xid) ) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 266af17aa7d9..191783f553ce 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1018,14 +1018,16 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) uint index = 0; unsigned int min_in_flight = UINT_MAX, max_in_flight = 0; struct TCP_Server_Info *server = NULL; - int i; + int i, start, cur; if (!ses) return NULL; spin_lock(&ses->chan_lock); + start = atomic_inc_return(&ses->chan_seq); for (i = 0; i < ses->chan_count; i++) { - server = ses->chans[i].server; + cur = (start + i) % ses->chan_count; + server = ses->chans[cur].server; if (!server || server->terminate) continue; @@ -1042,17 +1044,15 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) */ if (server->in_flight < min_in_flight) { min_in_flight = server->in_flight; - index = i; + index = cur; } if (server->in_flight > max_in_flight) max_in_flight = server->in_flight; } /* if all channels are equally loaded, fall back to round-robin */ - if (min_in_flight == max_in_flight) { - index = (uint)atomic_inc_return(&ses->chan_seq); - index %= ses->chan_count; - } + if (min_in_flight == max_in_flight) + index = (uint)start % ses->chan_count; server = ses->chans[index].server; spin_unlock(&ses->chan_lock); diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h new file mode 100644 index 000000000000..b9a385344ff3 --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (C) 2018, LG Electronics. + */ + +#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ +#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ + +/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ +struct smbdirect_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* + * Connection parameters mostly from [MS-SMBD] 3.1.1.1 + * + * These are setup and negotiated at the beginning of a + * connection and remain constant unless explicitly changed. + * + * Some values are important for the upper layer. + */ +struct smbdirect_socket_parameters { + __u16 recv_credit_max; + __u16 send_credit_target; + __u32 max_send_size; + __u32 max_fragmented_send_size; + __u32 max_recv_size; + __u32 max_fragmented_recv_size; + __u32 max_read_write_size; + __u32 keepalive_interval_msec; + __u32 keepalive_timeout_msec; +} __packed; + +#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */ diff --git a/fs/smb/common/smbdirect/smbdirect_pdu.h b/fs/smb/common/smbdirect/smbdirect_pdu.h new file mode 100644 index 000000000000..ae9fdb05ce23 --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_pdu.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2017 Stefan Metzmacher + */ + +#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__ +#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__ + +#define SMBDIRECT_V1 0x0100 + +/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */ +struct smbdirect_negotiate_req { + __le16 min_version; + __le16 max_version; + __le16 reserved; + __le16 credits_requested; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */ +struct smbdirect_negotiate_resp { + __le16 min_version; + __le16 max_version; + __le16 negotiated_version; + __le16 reserved; + __le16 credits_requested; + __le16 credits_granted; + __le32 status; + __le32 max_readwrite_size; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +#define SMBDIRECT_DATA_MIN_HDR_SIZE 0x14 +#define SMBDIRECT_DATA_OFFSET 0x18 + +#define SMBDIRECT_FLAG_RESPONSE_REQUESTED 0x0001 + +/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */ +struct smbdirect_data_transfer { + __le16 credits_requested; + __le16 credits_granted; + __le16 flags; + __le16 reserved; + __le32 remaining_data_length; + __le32 data_offset; + __le32 data_length; + __le32 padding; + __u8 buffer[]; +} __packed; + +#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__ */ diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h new file mode 100644 index 000000000000..e5b15cc44a7b --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025 Stefan Metzmacher + */ + +#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ +#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ + +enum smbdirect_socket_status { + SMBDIRECT_SOCKET_CREATED, + SMBDIRECT_SOCKET_CONNECTING, + SMBDIRECT_SOCKET_CONNECTED, + SMBDIRECT_SOCKET_NEGOTIATE_FAILED, + SMBDIRECT_SOCKET_DISCONNECTING, + SMBDIRECT_SOCKET_DISCONNECTED, + SMBDIRECT_SOCKET_DESTROYED +}; + +struct smbdirect_socket { + enum smbdirect_socket_status status; + + /* RDMA related */ + struct { + struct rdma_cm_id *cm_id; + } rdma; + + /* IB verbs related */ + struct { + struct ib_pd *pd; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + + /* + * shortcuts for rdma.cm_id->{qp,device}; + */ + struct ib_qp *qp; + struct ib_device *dev; + } ib; + + struct smbdirect_socket_parameters parameters; +}; + +#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */ diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index cf70e96ad4de..4a23a5e7e8fe 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -11,6 +11,7 @@ config SMB_SERVER select CRYPTO_HMAC select CRYPTO_ECB select CRYPTO_LIB_DES + select CRYPTO_LIB_SHA256 select CRYPTO_SHA256 select CRYPTO_CMAC select CRYPTO_SHA512 diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index b3d121052408..d99871c21451 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -979,40 +979,6 @@ out: return rc; } -int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len, - __u8 *pi_hash) -{ - int rc; - struct ksmbd_crypto_ctx *ctx = NULL; - - ctx = ksmbd_crypto_ctx_find_sha256(); - if (!ctx) { - ksmbd_debug(AUTH, "could not alloc sha256\n"); - return -ENOMEM; - } - - rc = crypto_shash_init(CRYPTO_SHA256(ctx)); - if (rc) { - ksmbd_debug(AUTH, "could not init shashn"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_SHA256(ctx), sd_buf, len); - if (rc) { - ksmbd_debug(AUTH, "could not update with n\n"); - goto out; - } - - rc = crypto_shash_final(CRYPTO_SHA256(ctx), pi_hash); - if (rc) { - ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc); - goto out; - } -out: - ksmbd_release_crypto_ctx(ctx); - return rc; -} - static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, int enc, u8 *key) { diff --git a/fs/smb/server/auth.h b/fs/smb/server/auth.h index 362b6159a6cf..6879a1bd1b91 100644 --- a/fs/smb/server/auth.h +++ b/fs/smb/server/auth.h @@ -66,6 +66,4 @@ int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn, struct ksmbd_session *sess); int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf, __u8 *pi_hash); -int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len, - __u8 *pi_hash); #endif diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 83764c230e9d..3f04a2977ba8 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -40,7 +40,7 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) kvfree(conn->request_buf); kfree(conn->preauth_info); if (atomic_dec_and_test(&conn->refcnt)) { - ksmbd_free_transport(conn->transport); + conn->transport->ops->free_transport(conn->transport); kfree(conn); } } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 14620e147dda..dd3e0e3f7bf0 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -108,6 +108,7 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; + bool is_aapl; }; struct ksmbd_conn_ops { @@ -132,6 +133,7 @@ struct ksmbd_transport_ops { void *buf, unsigned int len, struct smb2_buffer_desc_v1 *desc, unsigned int desc_len); + void (*free_transport)(struct ksmbd_transport *kt); }; struct ksmbd_transport { diff --git a/fs/smb/server/crypto_ctx.c b/fs/smb/server/crypto_ctx.c index ce733dc9a4a3..80bd68c8635e 100644 --- a/fs/smb/server/crypto_ctx.c +++ b/fs/smb/server/crypto_ctx.c @@ -75,9 +75,6 @@ static struct shash_desc *alloc_shash_desc(int id) case CRYPTO_SHASH_CMACAES: tfm = crypto_alloc_shash("cmac(aes)", 0, 0); break; - case CRYPTO_SHASH_SHA256: - tfm = crypto_alloc_shash("sha256", 0, 0); - break; case CRYPTO_SHASH_SHA512: tfm = crypto_alloc_shash("sha512", 0, 0); break; @@ -198,11 +195,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void) return ____crypto_shash_ctx_find(CRYPTO_SHASH_CMACAES); } -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void) -{ - return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA256); -} - struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void) { return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512); diff --git a/fs/smb/server/crypto_ctx.h b/fs/smb/server/crypto_ctx.h index 4a367c62f653..ac64801d52d3 100644 --- a/fs/smb/server/crypto_ctx.h +++ b/fs/smb/server/crypto_ctx.h @@ -13,7 +13,6 @@ enum { CRYPTO_SHASH_HMACMD5 = 0, CRYPTO_SHASH_HMACSHA256, CRYPTO_SHASH_CMACAES, - CRYPTO_SHASH_SHA256, CRYPTO_SHASH_SHA512, CRYPTO_SHASH_MAX, }; @@ -39,14 +38,12 @@ struct ksmbd_crypto_ctx { #define CRYPTO_HMACMD5(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]) #define CRYPTO_HMACSHA256(c) ((c)->desc[CRYPTO_SHASH_HMACSHA256]) #define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES]) -#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256]) #define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512]) #define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm) #define CRYPTO_HMACSHA256_TFM(c)\ ((c)->desc[CRYPTO_SHASH_HMACSHA256]->tfm) #define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm) -#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm) #define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm) #define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM]) @@ -57,7 +54,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void); -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void); void ksmbd_crypto_destroy(void); diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index ab533c602987..8c9c49c3a0a4 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -631,6 +631,5 @@ MODULE_SOFTDEP("pre: sha512"); MODULE_SOFTDEP("pre: aead2"); MODULE_SOFTDEP("pre: ccm"); MODULE_SOFTDEP("pre: gcm"); -MODULE_SOFTDEP("pre: crc32"); module_init(ksmbd_server_init) module_exit(ksmbd_server_exit) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 8d414239b3fe..0d92ce49aed7 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1594,7 +1594,7 @@ static int krb5_authenticate(struct ksmbd_work *work, struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; char *in_blob, *out_blob; - struct channel *chann = NULL; + struct channel *chann = NULL, *old; u64 prev_sess_id; int in_len, out_len; int retval; @@ -1607,24 +1607,38 @@ static int krb5_authenticate(struct ksmbd_work *work, out_len = work->response_sz - (le16_to_cpu(rsp->SecurityBufferOffset) + 4); - /* Check previous session */ - prev_sess_id = le64_to_cpu(req->PreviousSessionId); - if (prev_sess_id && prev_sess_id != sess->id) - destroy_previous_session(conn, sess->user, prev_sess_id); - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { ksmbd_debug(SMB, "krb5 authentication failed\n"); return -EINVAL; } + + /* Check previous session */ + prev_sess_id = le64_to_cpu(req->PreviousSessionId); + if (prev_sess_id && prev_sess_id != sess->id) + destroy_previous_session(conn, sess->user, prev_sess_id); + rsp->SecurityBufferLength = cpu_to_le16(out_len); - if ((conn->sign || server_conf.enforced_signing) || + /* + * If session state is SMB2_SESSION_VALID, We can assume + * that it is reauthentication. And the user/password + * has been verified, so return it here. + */ + if (sess->state == SMB2_SESSION_VALID) { + if (conn->binding) + goto binding_session; + return 0; + } + + if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE && + (conn->sign || server_conf.enforced_signing)) || (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) sess->sign = true; - if (smb3_encryption_negotiated(conn)) { + if (smb3_encryption_negotiated(conn) && + !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { retval = conn->ops->generate_encryptionkey(conn, sess); if (retval) { ksmbd_debug(SMB, @@ -1637,6 +1651,7 @@ static int krb5_authenticate(struct ksmbd_work *work, sess->sign = false; } +binding_session: if (conn->dialect >= SMB30_PROT_ID) { chann = lookup_chann_list(sess, conn); if (!chann) { @@ -1645,7 +1660,12 @@ static int krb5_authenticate(struct ksmbd_work *work, return -ENOMEM; chann->conn = conn; - xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->ksmbd_chann_list, (long)conn, + chann, KSMBD_DEFAULT_GFP); + if (xa_is_err(old)) { + kfree(chann); + return xa_err(old); + } } } @@ -1832,8 +1852,6 @@ int smb2_sess_setup(struct ksmbd_work *work) ksmbd_conn_set_good(conn); sess->state = SMB2_SESSION_VALID; } - kfree(sess->Preauth_HashValue); - sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { if (negblob->MessageType == NtLmNegotiate) { rc = ntlm_negotiate(work, negblob, negblob_len, rsp); @@ -1860,8 +1878,6 @@ int smb2_sess_setup(struct ksmbd_work *work) kfree(preauth_sess); } } - kfree(sess->Preauth_HashValue); - sess->Preauth_HashValue = NULL; } else { pr_info_ratelimited("Unknown NTLMSSP message type : 0x%x\n", le32_to_cpu(negblob->MessageType)); @@ -2580,7 +2596,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, } } -static int smb2_creat(struct ksmbd_work *work, struct path *parent_path, +static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, int open_flags, umode_t posix_mode, bool is_dir) { @@ -2609,7 +2625,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *parent_path, return rc; } - rc = ksmbd_vfs_kern_path_locked(work, name, 0, parent_path, path, 0); + rc = ksmbd_vfs_kern_path(work, name, 0, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2859,7 +2875,7 @@ int smb2_open(struct ksmbd_work *work) struct ksmbd_tree_connect *tcon = work->tcon; struct smb2_create_req *req; struct smb2_create_rsp *rsp; - struct path path, parent_path; + struct path path; struct ksmbd_share_config *share = tcon->share_conf; struct ksmbd_file *fp = NULL; struct file *filp = NULL; @@ -2874,7 +2890,7 @@ int smb2_open(struct ksmbd_work *work) int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0; int rc = 0; int contxt_cnt = 0, query_disk_id = 0; - int maximal_access_ctxt = 0, posix_ctxt = 0; + bool maximal_access_ctxt = false, posix_ctxt = false; int s_type = 0; int next_off = 0; char *name = NULL; @@ -2903,6 +2919,27 @@ int smb2_open(struct ksmbd_work *work) return create_smb2_pipe(work); } + if (req->CreateContextsOffset && tcon->posix_extensions) { + context = smb2_find_context_vals(req, SMB2_CREATE_TAG_POSIX, 16); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out2; + } else if (context) { + struct create_posix *posix = (struct create_posix *)context; + + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_posix) - 4) { + rc = -EINVAL; + goto err_out2; + } + ksmbd_debug(SMB, "get posix context\n"); + + posix_mode = le32_to_cpu(posix->Mode); + posix_ctxt = true; + } + } + if (req->NameLength) { name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset), le16_to_cpu(req->NameLength), @@ -2925,9 +2962,11 @@ int smb2_open(struct ksmbd_work *work) goto err_out2; } - rc = ksmbd_validate_filename(name); - if (rc < 0) - goto err_out2; + if (posix_ctxt == false) { + rc = ksmbd_validate_filename(name); + if (rc < 0) + goto err_out2; + } if (ksmbd_share_veto_filename(share, name)) { rc = -ENOENT; @@ -3085,28 +3124,6 @@ int smb2_open(struct ksmbd_work *work) rc = -EBADF; goto err_out2; } - - if (tcon->posix_extensions) { - context = smb2_find_context_vals(req, - SMB2_CREATE_TAG_POSIX, 16); - if (IS_ERR(context)) { - rc = PTR_ERR(context); - goto err_out2; - } else if (context) { - struct create_posix *posix = - (struct create_posix *)context; - if (le16_to_cpu(context->DataOffset) + - le32_to_cpu(context->DataLength) < - sizeof(struct create_posix) - 4) { - rc = -EINVAL; - goto err_out2; - } - ksmbd_debug(SMB, "get posix context\n"); - - posix_mode = le32_to_cpu(posix->Mode); - posix_ctxt = 1; - } - } } if (ksmbd_override_fsids(work)) { @@ -3114,8 +3131,8 @@ int smb2_open(struct ksmbd_work *work) goto err_out2; } - rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, - &parent_path, &path, 1); + rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, + &path, 1); if (!rc) { file_present = true; @@ -3236,7 +3253,7 @@ int smb2_open(struct ksmbd_work *work) /*create file if not present */ if (!file_present) { - rc = smb2_creat(work, &parent_path, &path, name, open_flags, + rc = smb2_creat(work, &path, name, open_flags, posix_mode, req->CreateOptions & FILE_DIRECTORY_FILE_LE); if (rc) { @@ -3441,7 +3458,7 @@ int smb2_open(struct ksmbd_work *work) } if (file_present || created) - ksmbd_vfs_kern_path_unlock(&parent_path, &path); + path_put(&path); if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC && !fp->attrib_only && !stream_name) { @@ -3539,6 +3556,15 @@ int smb2_open(struct ksmbd_work *work) ksmbd_debug(SMB, "get query on disk id context\n"); query_disk_id = 1; } + + if (conn->is_aapl == false) { + context = smb2_find_context_vals(req, SMB2_CREATE_AAPL, 4); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out1; + } else if (context) + conn->is_aapl = true; + } } rc = ksmbd_vfs_getattr(&path, &stat); @@ -3713,7 +3739,7 @@ reconnected_fp: err_out: if (rc && (file_present || created)) - ksmbd_vfs_kern_path_unlock(&parent_path, &path); + path_put(&path); err_out1: ksmbd_revert_fsids(work); @@ -3978,7 +4004,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, if (dinfo->EaSize) dinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE; dinfo->Reserved = 0; - dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); + if (conn->is_aapl) + dinfo->UniqueId = 0; + else + dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); if (d_info->hide_dot_file && d_info->name[0] == '.') dinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE; memcpy(dinfo->FileName, conv_name, conv_len); @@ -3995,7 +4024,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode); if (fibdinfo->EaSize) fibdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE; - fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); + if (conn->is_aapl) + fibdinfo->UniqueId = 0; + else + fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); fibdinfo->ShortNameLength = 0; fibdinfo->Reserved = 0; fibdinfo->Reserved2 = cpu_to_le16(0); @@ -4091,20 +4123,6 @@ struct smb2_query_dir_private { int info_level; }; -static void lock_dir(struct ksmbd_file *dir_fp) -{ - struct dentry *dir = dir_fp->filp->f_path.dentry; - - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); -} - -static void unlock_dir(struct ksmbd_file *dir_fp) -{ - struct dentry *dir = dir_fp->filp->f_path.dentry; - - inode_unlock(d_inode(dir)); -} - static int process_query_dir_entries(struct smb2_query_dir_private *priv) { struct mnt_idmap *idmap = file_mnt_idmap(priv->dir_fp->filp); @@ -4119,12 +4137,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) if (dentry_name(priv->d_info, priv->info_level)) return -EINVAL; - lock_dir(priv->dir_fp); - dent = lookup_one(idmap, - &QSTR_LEN(priv->d_info->name, - priv->d_info->name_len), - priv->dir_fp->filp->f_path.dentry); - unlock_dir(priv->dir_fp); + dent = lookup_one_unlocked(idmap, + &QSTR_LEN(priv->d_info->name, + priv->d_info->name_len), + priv->dir_fp->filp->f_path.dentry); if (IS_ERR(dent)) { ksmbd_debug(SMB, "Cannot lookup `%s' [%ld]\n", @@ -4855,8 +4871,13 @@ static int get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); - sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9); - sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9); + sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + sinfo->AllocationSize = cpu_to_le64(fp->stream.size); + sinfo->EndOfFile = cpu_to_le64(fp->stream.size); + } sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); sinfo->DeletePending = delete_pending; sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; @@ -4919,9 +4940,14 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; file_info->Pad1 = 0; - file_info->AllocationSize = - cpu_to_le64(stat.blocks << 9); - file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + file_info->AllocationSize = + cpu_to_le64(stat.blocks << 9); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + } file_info->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); file_info->DeletePending = delete_pending; @@ -4930,7 +4956,10 @@ static int get_file_all_info(struct ksmbd_work *work, file_info->IndexNumber = cpu_to_le64(stat.ino); file_info->EASize = 0; file_info->AccessFlags = fp->daccess; - file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + if (ksmbd_stream_fd(fp) == false) + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + else + file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos); file_info->Mode = fp->coption; file_info->AlignmentRequirement = 0; conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename, @@ -5118,8 +5147,13 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, time = ksmbd_UnixTimeToNT(stat.ctime); file_info->ChangeTime = cpu_to_le64(time); file_info->Attributes = fp->f_ci->m_fattr; - file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); - file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + if (ksmbd_stream_fd(fp) == false) { + file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + } else { + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + } file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); @@ -5142,7 +5176,11 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp, struct smb2_file_pos_info *file_info; file_info = (struct smb2_file_pos_info *)rsp->Buffer; - file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + if (ksmbd_stream_fd(fp) == false) + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + else + file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos); + rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_pos_info)); } @@ -5231,8 +5269,13 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->ChangeTime = cpu_to_le64(time); file_info->DosAttributes = fp->f_ci->m_fattr; file_info->Inode = cpu_to_le64(stat.ino); - file_info->EndOfFile = cpu_to_le64(stat.size); - file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + if (ksmbd_stream_fd(fp) == false) { + file_info->EndOfFile = cpu_to_le64(stat.size); + file_info->AllocationSize = cpu_to_le64(stat.blocks << 9); + } else { + file_info->EndOfFile = cpu_to_le64(fp->stream.size); + file_info->AllocationSize = cpu_to_le64(fp->stream.size); + } file_info->HardLinks = cpu_to_le32(stat.nlink); file_info->Mode = cpu_to_le32(stat.mode & 0777); switch (stat.mode & S_IFMT) { @@ -6008,8 +6051,7 @@ static int smb2_create_link(struct ksmbd_work *work, struct nls_table *local_nls) { char *link_name = NULL, *target_name = NULL, *pathname = NULL; - struct path path, parent_path; - bool file_present = false; + struct path path; int rc; if (buf_len < (u64)sizeof(struct smb2_file_link_info) + @@ -6038,15 +6080,12 @@ static int smb2_create_link(struct ksmbd_work *work, ksmbd_debug(SMB, "target name is %s\n", target_name); rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, - &parent_path, &path, 0); + &path, 0); if (rc) { if (rc != -ENOENT) goto out; - } else - file_present = true; - - if (file_info->ReplaceIfExists) { - if (file_present) { + } else { + if (file_info->ReplaceIfExists) { rc = ksmbd_vfs_remove_file(work, &path); if (rc) { rc = -EINVAL; @@ -6054,21 +6093,17 @@ static int smb2_create_link(struct ksmbd_work *work, link_name); goto out; } - } - } else { - if (file_present) { + } else { rc = -EEXIST; ksmbd_debug(SMB, "link already exists\n"); goto out; } + ksmbd_vfs_kern_path_unlock(&path); } - rc = ksmbd_vfs_link(work, target_name, link_name); if (rc) rc = -EINVAL; out: - if (file_present) - ksmbd_vfs_kern_path_unlock(&parent_path, &path); if (!IS_ERR(link_name)) kfree(link_name); @@ -6174,6 +6209,9 @@ static int set_file_allocation_info(struct ksmbd_work *work, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; + if (ksmbd_stream_fd(fp) == true) + return 0; + rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (rc) @@ -6232,7 +6270,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, * truncate of some filesystem like FAT32 fill zero data in * truncated range. */ - if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { + if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC && + ksmbd_stream_fd(fp) == false) { ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize); rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { @@ -6305,7 +6344,13 @@ static int set_file_position_info(struct ksmbd_file *fp, return -EINVAL; } - fp->filp->f_pos = current_byte_offset; + if (ksmbd_stream_fd(fp) == false) + fp->filp->f_pos = current_byte_offset; + else { + if (current_byte_offset > XATTR_SIZE_MAX) + current_byte_offset = XATTR_SIZE_MAX; + fp->stream.pos = current_byte_offset; + } return 0; } @@ -7793,7 +7838,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, if (!ksmbd_find_netdev_name_iface_list(netdev->name)) continue; - flags = dev_get_flags(netdev); + flags = netif_get_flags(netdev); if (!(flags & IFF_RUNNING)) continue; ipv6_retry: @@ -8519,11 +8564,6 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) goto err_out; } - opinfo->op_state = OPLOCK_STATE_NONE; - wake_up_interruptible_all(&opinfo->oplock_q); - opinfo_put(opinfo); - ksmbd_fd_put(work, fp); - rsp->StructureSize = cpu_to_le16(24); rsp->OplockLevel = rsp_oplevel; rsp->Reserved = 0; @@ -8531,16 +8571,15 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->VolatileFid = volatile_id; rsp->PersistentFid = persistent_id; ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break)); - if (!ret) - return; - + if (ret) { err_out: + smb2_set_err_rsp(work); + } + opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); - opinfo_put(opinfo); ksmbd_fd_put(work, fp); - smb2_set_err_rsp(work); } static int check_lease_state(struct lease *lease, __le32 req_state) @@ -8670,11 +8709,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) } lease_state = lease->state; - opinfo->op_state = OPLOCK_STATE_NONE; - wake_up_interruptible_all(&opinfo->oplock_q); - atomic_dec(&opinfo->breaking_cnt); - wake_up_interruptible_all(&opinfo->oplock_brk); - opinfo_put(opinfo); rsp->StructureSize = cpu_to_le16(36); rsp->Reserved = 0; @@ -8683,16 +8717,16 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) rsp->LeaseState = lease_state; rsp->LeaseDuration = 0; ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack)); - if (!ret) - return; - + if (ret) { err_out: + smb2_set_err_rsp(work); + } + + opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); atomic_dec(&opinfo->breaking_cnt); wake_up_interruptible_all(&opinfo->oplock_brk); - opinfo_put(opinfo); - smb2_set_err_rsp(work); } /** diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 17a0b18a8406..16ae8a10490b 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -63,6 +63,9 @@ struct preauth_integrity_info { #define SMB2_SESSION_TIMEOUT (10 * HZ) +/* Apple Defined Contexts */ +#define SMB2_CREATE_AAPL "AAPL" + struct create_durable_req_v2 { struct create_context_hdr ccontext; __u8 Name[8]; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 4998df04ab95..c6cbe0d56e32 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -159,7 +159,8 @@ struct smb_direct_transport { }; #define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) - +#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \ + struct smb_direct_transport, transport)) enum { SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, SMB_DIRECT_MSG_DATA_TRANSFER @@ -410,6 +411,11 @@ err: return NULL; } +static void smb_direct_free_transport(struct ksmbd_transport *kt) +{ + kfree(SMBD_TRANS(kt)); +} + static void free_transport(struct smb_direct_transport *t) { struct smb_direct_recvmsg *recvmsg; @@ -427,7 +433,8 @@ static void free_transport(struct smb_direct_transport *t) if (t->qp) { ib_drain_qp(t->qp); ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); - ib_destroy_qp(t->qp); + t->qp = NULL; + rdma_destroy_qp(t->cm_id); } ksmbd_debug(RDMA, "drain the reassembly queue\n"); @@ -455,7 +462,6 @@ static void free_transport(struct smb_direct_transport *t) smb_direct_destroy_pools(t); ksmbd_conn_free(KSMBD_TRANS(t)->conn); - kfree(t); } static struct smb_direct_sendmsg @@ -1935,8 +1941,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, return 0; err: if (t->qp) { - ib_destroy_qp(t->qp); t->qp = NULL; + rdma_destroy_qp(t->cm_id); } if (t->recv_cq) { ib_destroy_cq(t->recv_cq); @@ -2281,4 +2287,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, .rdma_write = smb_direct_rdma_write, + .free_transport = smb_direct_free_transport, }; diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index abedf510899a..f8c772a7cb43 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -58,12 +58,10 @@ static inline void ksmbd_tcp_reuseaddr(struct socket *sock) static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs) { - lock_sock(sock->sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) - sock->sk->sk_rcvtimeo = secs * HZ; + WRITE_ONCE(sock->sk->sk_rcvtimeo, secs * HZ); else - sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - release_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvtimeo, MAX_SCHEDULE_TIMEOUT); } static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs) @@ -93,7 +91,7 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return t; } -void ksmbd_free_transport(struct ksmbd_transport *kt) +static void ksmbd_tcp_free_transport(struct ksmbd_transport *kt) { struct tcp_transport *t = TCP_TRANS(kt); @@ -656,4 +654,5 @@ static const struct ksmbd_transport_ops ksmbd_tcp_transport_ops = { .read = ksmbd_tcp_read, .writev = ksmbd_tcp_writev, .disconnect = ksmbd_tcp_disconnect, + .free_transport = ksmbd_tcp_free_transport, }; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index baf0d3031a44..04539037108c 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -4,6 +4,7 @@ * Copyright (C) 2018 Samsung Electronics Co., Ltd. */ +#include <crypto/sha2.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/filelock.h> @@ -65,13 +66,12 @@ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) return 0; } -static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, - char *pathname, unsigned int flags, - struct path *parent_path, - struct path *path) +static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, + char *pathname, unsigned int flags, + struct path *path, bool do_lock) { struct qstr last; - struct filename *filename; + struct filename *filename __free(putname) = NULL; struct path *root_share_path = &share_conf->vfs_path; int err, type; struct dentry *d; @@ -88,51 +88,57 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, return PTR_ERR(filename); err = vfs_path_parent_lookup(filename, flags, - parent_path, &last, &type, + path, &last, &type, root_share_path); - if (err) { - putname(filename); + if (err) return err; - } if (unlikely(type != LAST_NORM)) { - path_put(parent_path); - putname(filename); + path_put(path); return -ENOENT; } - err = mnt_want_write(parent_path->mnt); - if (err) { - path_put(parent_path); - putname(filename); + if (do_lock) { + err = mnt_want_write(path->mnt); + if (err) { + path_put(path); + return -ENOENT; + } + + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, path->dentry, 0); + + if (!IS_ERR(d)) { + dput(path->dentry); + path->dentry = d; + return 0; + } + inode_unlock(path->dentry->d_inode); + mnt_drop_write(path->mnt); + path_put(path); return -ENOENT; } - inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path->dentry, 0); - if (IS_ERR(d)) - goto err_out; - + d = lookup_noperm_unlocked(&last, path->dentry); + if (!IS_ERR(d) && d_is_negative(d)) { + dput(d); + d = ERR_PTR(-ENOENT); + } + if (IS_ERR(d)) { + path_put(path); + return -ENOENT; + } + dput(path->dentry); path->dentry = d; - path->mnt = mntget(parent_path->mnt); if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_CROSSMNT)) { err = follow_down(path, 0); if (err < 0) { path_put(path); - goto err_out; + return -ENOENT; } } - - putname(filename); return 0; - -err_out: - inode_unlock(d_inode(parent_path->dentry)); - mnt_drop_write(parent_path->mnt); - path_put(parent_path); - putname(filename); - return -ENOENT; } void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, @@ -292,6 +298,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, if (v_len - *pos < count) count = v_len - *pos; + fp->stream.pos = v_len; memcpy(buf, &stream_buf[*pos], count); @@ -455,8 +462,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, true); if (err < 0) goto out; - - fp->filp->f_pos = *pos; + else + fp->stream.pos = size; err = 0; out: kvfree(stream_buf); @@ -546,7 +553,8 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat) { int err; - err = vfs_getattr(path, stat, STATX_BTIME, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(path, stat, STATX_BASIC_STATS | STATX_BTIME, + AT_STATX_SYNC_AS_STAT); if (err) pr_err("getattr failed, err %d\n", err); return err; @@ -763,10 +771,10 @@ retry: } rd.old_mnt_idmap = mnt_idmap(old_path->mnt), - rd.old_dir = d_inode(old_parent), + rd.old_parent = old_parent, rd.old_dentry = old_child, rd.new_mnt_idmap = mnt_idmap(new_path.mnt), - rd.new_dir = new_path.dentry->d_inode, + rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, rd.delegated_inode = NULL, @@ -1196,103 +1204,114 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, return ret; } -/** - * ksmbd_vfs_kern_path_locked() - lookup a file and get path info - * @work: work - * @name: file path that is relative to share - * @flags: lookup flags - * @parent_path: if lookup succeed, return parent_path info - * @path: if lookup succeed, return path info - * @caseless: caseless filename lookup - * - * Return: 0 on success, otherwise error - */ -int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *parent_path, - struct path *path, bool caseless) +static +int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, + unsigned int flags, + struct path *path, bool caseless, bool do_lock) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; + struct path parent_path; + size_t path_len, remain_len; int err; - err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, parent_path, - path); - if (!err) - return 0; - - if (caseless) { - char *filepath; - size_t path_len, remain_len; - - filepath = name; - path_len = strlen(filepath); - remain_len = path_len; - - *parent_path = share_conf->vfs_path; - path_get(parent_path); - - while (d_can_lookup(parent_path->dentry)) { - char *filename = filepath + path_len - remain_len; - char *next = strchrnul(filename, '/'); - size_t filename_len = next - filename; - bool is_last = !next[0]; +retry: + err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, do_lock); + if (!err || !caseless) + return err; - if (filename_len == 0) - break; + path_len = strlen(filepath); + remain_len = path_len; - err = ksmbd_vfs_lookup_in_dir(parent_path, filename, - filename_len, - work->conn->um); - if (err) - goto out2; + parent_path = share_conf->vfs_path; + path_get(&parent_path); - next[0] = '\0'; + while (d_can_lookup(parent_path.dentry)) { + char *filename = filepath + path_len - remain_len; + char *next = strchrnul(filename, '/'); + size_t filename_len = next - filename; + bool is_last = !next[0]; - err = vfs_path_lookup(share_conf->vfs_path.dentry, - share_conf->vfs_path.mnt, - filepath, - flags, - path); - if (!is_last) - next[0] = '/'; - if (err) - goto out2; - else if (is_last) - goto out1; - path_put(parent_path); - *parent_path = *path; + if (filename_len == 0) + break; - remain_len -= filename_len + 1; + err = ksmbd_vfs_lookup_in_dir(&parent_path, filename, + filename_len, + work->conn->um); + path_put(&parent_path); + if (err) + goto out; + if (is_last) { + caseless = false; + goto retry; } + next[0] = '\0'; + + err = vfs_path_lookup(share_conf->vfs_path.dentry, + share_conf->vfs_path.mnt, + filepath, + flags, + &parent_path); + next[0] = '/'; + if (err) + goto out; - err = -EINVAL; -out2: - path_put(parent_path); + remain_len -= filename_len + 1; } -out1: - if (!err) { - err = mnt_want_write(parent_path->mnt); - if (err) { - path_put(path); - path_put(parent_path); - return err; - } - - err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry); - if (err) { - path_put(path); - path_put(parent_path); - } - } + err = -EINVAL; + path_put(&parent_path); +out: return err; } -void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path) +/** + * ksmbd_vfs_kern_path() - lookup a file and get path info + * @work: work + * @filepath: file path that is relative to share + * @flags: lookup flags + * @path: if lookup succeed, return path info + * @caseless: caseless filename lookup + * + * Perform the lookup, possibly crossing over any mount point. + * On return no locks will be held and write-access to filesystem + * won't have been checked. + * Return: 0 if file was found, otherwise error + */ +int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, + unsigned int flags, + struct path *path, bool caseless) +{ + return __ksmbd_vfs_kern_path(work, filepath, flags, path, + caseless, false); +} + +/** + * ksmbd_vfs_kern_path_locked() - lookup a file and get path info + * @work: work + * @filepath: file path that is relative to share + * @flags: lookup flags + * @path: if lookup succeed, return path info + * @caseless: caseless filename lookup + * + * Perform the lookup, but don't cross over any mount point. + * On return the parent of path->dentry will be locked and write-access to + * filesystem will have been gained. + * Return: 0 on if file was found, otherwise error + */ +int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath, + unsigned int flags, + struct path *path, bool caseless) +{ + return __ksmbd_vfs_kern_path(work, filepath, flags, path, + caseless, true); +} + +void ksmbd_vfs_kern_path_unlock(struct path *path) { - inode_unlock(d_inode(parent_path->dentry)); - mnt_drop_write(parent_path->mnt); + /* While lock is still held, ->d_parent is safe */ + inode_unlock(d_inode(path->dentry->d_parent)); + mnt_drop_write(path->mnt); path_put(path); - path_put(parent_path); } struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, @@ -1476,11 +1495,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, acl.sd_buf = (char *)pntsd; acl.sd_size = len; - rc = ksmbd_gen_sd_hash(conn, acl.sd_buf, acl.sd_size, acl.hash); - if (rc) { - pr_err("failed to generate hash for ndr acl\n"); - return rc; - } + sha256(acl.sd_buf, acl.sd_size, acl.hash); smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); @@ -1495,12 +1510,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, - acl.posix_acl_hash); - if (rc) { - pr_err("failed to generate hash for ndr acl\n"); - goto out; - } + sha256(acl_ndr.data, acl_ndr.offset, acl.posix_acl_hash); rc = ndr_encode_v4_ntacl(&sd_ndr, &acl); if (rc) { @@ -1557,11 +1567,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, goto out_free; } - rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, cmp_hash); - if (rc) { - pr_err("failed to generate hash for ndr acl\n"); - goto out_free; - } + sha256(acl_ndr.data, acl_ndr.offset, cmp_hash); if (memcmp(cmp_hash, acl.posix_acl_hash, XATTR_SD_HASH_SIZE)) { pr_err("hash value diff\n"); diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 2893f59803a6..d47472f3e30b 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -117,10 +117,13 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, const struct path *path, char *attr_name, bool get_write); +int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, + unsigned int flags, + struct path *path, bool caseless); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *parent_path, + unsigned int flags, struct path *path, bool caseless); -void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path); +void ksmbd_vfs_kern_path_unlock(struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 5bbb179736c2..0708155b5caf 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -44,6 +44,7 @@ struct ksmbd_lock { struct stream { char *name; ssize_t size; + loff_t pos; }; struct ksmbd_inode { diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index b1091e70434a..a9602aae21ef 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -149,6 +149,27 @@ config SQUASHFS_XATTR If unsure, say N. +config SQUASHFS_COMP_CACHE_FULL + bool "Enable full caching of compressed blocks" + depends on SQUASHFS + default n + help + This option enables caching of all compressed blocks, Without caching, + repeated reads of the same files trigger excessive disk I/O, significantly + reducinng performance in workloads like fio-based benchmarks. + + For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show: + With caching: IOPS=2223, BW=278MiB/s (291MB/s) + Without caching: IOPS=815, BW=102MiB/s (107MB/s) + + Enabling this option restores performance to pre-regression levels by + caching all compressed blocks in the page cache, reducing disk I/O for + repeated reads. However, this increases memory usage, which may be a + concern in memory-constrained environments. + + Enable this option if your workload involves frequent repeated reads and + memory usage is not a limiting factor. If unsure, say N. + config SQUASHFS_ZLIB bool "Include support for ZLIB compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2dc730800f44..3061043e915c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -88,6 +88,10 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct bio_vec *bv; int idx = 0; int err = 0; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + struct page **cache_pages = kmalloc_array(page_count, + sizeof(void *), GFP_KERNEL | __GFP_ZERO); +#endif bio_for_each_segment_all(bv, fullbio, iter_all) { struct page *page = bv->bv_page; @@ -110,6 +114,11 @@ static int squashfs_bio_read_cached(struct bio *fullbio, head_to_cache = page; else if (idx == page_count - 1 && index + length != read_end) tail_to_cache = page; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + /* Cache all pages in the BIO for repeated reads */ + else if (cache_pages) + cache_pages[idx] = page; +#endif if (!bio || idx != end_idx) { struct bio *new = bio_alloc_clone(bdev, fullbio, @@ -163,6 +172,25 @@ static int squashfs_bio_read_cached(struct bio *fullbio, } } +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + if (!cache_pages) + goto out; + + for (idx = 0; idx < page_count; idx++) { + if (!cache_pages[idx]) + continue; + int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + (read_start >> PAGE_SHIFT) + idx, + GFP_NOIO); + + if (!ret) { + SetPageUptodate(cache_pages[idx]); + unlock_page(cache_pages[idx]); + } + } + kfree(cache_pages); +out: +#endif return 0; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 67c55fe32ce8..992ea0e37257 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + if (!msblk->devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); diff --git a/fs/stack.c b/fs/stack.c index f18920119944..d8c782e064e3 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -3,7 +3,7 @@ #include <linux/fs.h> #include <linux/fs_stack.h> -/* does _NOT_ require i_mutex to be held. +/* does _NOT_ require i_rwsem to be held. * * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems @@ -41,7 +41,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for * fsstack_copy_inode_size() to hold some lock around * i_size_write(), otherwise i_size_read() may spin forever (see - * include/linux/fs.h). We don't necessarily hold i_mutex when this + * include/linux/fs.h). We don't necessarily hold i_rwsem when this * is called, so take i_lock for that case. * * And if on 32-bit, continue our effort to keep the two halves of diff --git a/fs/super.c b/fs/super.c index 21799e213fd7..7f876f32343a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -964,8 +964,10 @@ void iterate_supers_type(struct file_system_type *type, spin_unlock(&sb_lock); locked = super_lock_shared(sb); - if (locked) + if (locked) { f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -1457,6 +1459,17 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) if (!sb) return; + if (sb->s_op->remove_bdev) { + int ret; + + ret = sb->s_op->remove_bdev(sb, bdev); + if (!ret) { + super_unlock_shared(sb); + return; + } + /* Fallback to shutdown. */ + } + if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index c3d3b079aedd..1ca143d2f22a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -83,7 +83,7 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { - struct bin_attribute *battr = of->kn->priv; + const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; @@ -149,7 +149,7 @@ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { - struct bin_attribute *battr = of->kn->priv; + const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; @@ -173,7 +173,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { - struct bin_attribute *battr = of->kn->priv; + const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); return battr->mmap(of->file, kobj, battr, vma); @@ -182,7 +182,7 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, int whence) { - struct bin_attribute *battr = of->kn->priv; + const struct bin_attribute *battr = of->kn->priv; struct kobject *kobj = sysfs_file_kobj(of->kn); if (battr->llseek) @@ -193,7 +193,7 @@ static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, static int sysfs_kf_bin_open(struct kernfs_open_file *of) { - struct bin_attribute *battr = of->kn->priv; + const struct bin_attribute *battr = of->kn->priv; if (battr->f_mapping) of->file->f_mapping = battr->f_mapping(); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index a3fd3cc591bd..0c023941a316 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -465,9 +465,20 @@ static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name, return !(ei && ei->is_freed); } +static int tracefs_d_delete(const struct dentry *dentry) +{ + /* + * We want to keep eventfs dentries around but not tracefs + * ones. eventfs dentries have content in d_fsdata. + * Use d_fsdata to determine if it's a eventfs dentry or not. + */ + return dentry->d_fsdata == NULL; +} + static const struct dentry_operations tracefs_dentry_operations = { .d_revalidate = tracefs_d_revalidate, .d_release = tracefs_d_release, + .d_delete = tracefs_d_delete, }; static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc) @@ -480,7 +491,7 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc) return err; sb->s_op = &tracefs_super_operations; - sb->s_d_op = &tracefs_dentry_operations; + set_default_d_op(sb, &tracefs_dentry_operations); return 0; } @@ -551,20 +562,9 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - inode_lock(d_inode(parent)); - if (unlikely(IS_DEADDIR(d_inode(parent)))) - dentry = ERR_PTR(-ENOENT); - else - dentry = lookup_noperm(&QSTR(name), parent); - if (!IS_ERR(dentry) && d_inode(dentry)) { - dput(dentry); - dentry = ERR_PTR(-EEXIST); - } - - if (IS_ERR(dentry)) { - inode_unlock(d_inode(parent)); + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - } return dentry; } diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 921f9033d0d2..fb5ac358077b 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -51,7 +51,7 @@ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, memset(p + in_len, 0, pad_len - in_len); err = fscrypt_encrypt_block_inplace(inode, virt_to_page(p), pad_len, - offset_in_page(p), block, GFP_NOFS); + offset_in_page(p), block); if (err) { ubifs_err(c, "fscrypt_encrypt_block_inplace() failed: %d", err); return err; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index bf311c38d9a8..e75a6cec67be 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -404,7 +404,8 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio, * there is a plenty of flash space and the budget will be acquired quickly, * without forcing write-back. The slow path does not make this assumption. */ -static int ubifs_write_begin(struct file *file, struct address_space *mapping, +static int ubifs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -514,8 +515,9 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio, } } -static int ubifs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, +static int ubifs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -977,8 +979,7 @@ static int do_writepage(struct folio *folio, size_t len) * on the page lock and it would not write the truncated inode node to the * journal before we have finished. */ -static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, - void *data) +static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) { struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1050,7 +1051,12 @@ out_unlock: static int ubifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return write_cache_pages(mapping, wbc, ubifs_writepage, NULL); + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = ubifs_writepage(folio, wbc); + return error; } /** @@ -1579,17 +1585,17 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .page_mkwrite = ubifs_vm_page_mkwrite, }; -static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int ubifs_file_mmap_prepare(struct vm_area_desc *desc) { int err; - err = generic_file_mmap(file, vma); + err = generic_file_mmap_prepare(desc); if (err) return err; - vma->vm_ops = &ubifs_file_vm_ops; + desc->vm_ops = &ubifs_file_vm_ops; if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) - file_accessed(file); + file_accessed(desc->file); return 0; } @@ -1652,7 +1658,7 @@ const struct file_operations ubifs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = ubifs_write_iter, - .mmap = ubifs_file_mmap, + .mmap_prepare = ubifs_file_mmap_prepare, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = filemap_splice_read, diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 2c99349cf537..79536b2e3d7a 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -130,7 +130,7 @@ static int setflags(struct inode *inode, int flags) return err; } -int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int flags = ubifs2ioctl(ubifs_inode(inode)->flags); @@ -145,7 +145,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ubifs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int flags = fa->flags; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index ee954e64ce7f..e28ab4395e5c 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -985,7 +985,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); if (kill_xattrs && ui->xattr_cnt > ubifs_xattr_max_cnt(c)) { - ubifs_err(c, "Cannot delete inode, it has too much xattrs!"); + ubifs_err(c, "Cannot delete inode, it has too many xattrs!"); err = -EPERM; ubifs_ro_mode(c, err); return err; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 256dbaeeb0de..5db45c9e26ee 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2073,9 +2073,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place); void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ -int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int ubifs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void ubifs_set_inode_flags(struct inode *inode); #ifdef CONFIG_COMPAT diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4386dd845e40..f24aa98e6869 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -181,19 +181,23 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_adinicb_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) +static int udf_adinicb_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct inode *inode = folio->mapping->host; + struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); + struct folio *folio = NULL; + int error = 0; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio->index != 0); + memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, + 0, i_size_read(inode)); + folio_unlock(folio); + } - BUG_ON(!folio_test_locked(folio)); - BUG_ON(folio->index != 0); - memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, - i_size_read(inode)); - folio_unlock(folio); mark_inode_dirty(inode); - return 0; } @@ -203,9 +207,9 @@ static int udf_writepages(struct address_space *mapping, struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); - if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return mpage_writepages(mapping, wbc, udf_get_block_wb); - return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + return udf_adinicb_writepages(mapping, wbc); + return mpage_writepages(mapping, wbc, udf_get_block_wb); } static void udf_adinicb_read_folio(struct folio *folio) @@ -244,10 +248,12 @@ static void udf_readahead(struct readahead_control *rac) mpage_readahead(rac, udf_get_block); } -static int udf_write_begin(struct file *file, struct address_space *mapping, +static int udf_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; @@ -271,15 +277,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, return 0; } -static int udf_write_end(struct file *file, struct address_space *mapping, +static int udf_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(iocb->ki_filp); loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return generic_write_end(file, mapping, pos, len, copied, folio, + return generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); last_pos = pos + copied; if (last_pos > inode->i_size) diff --git a/fs/udf/super.c b/fs/udf/super.c index 1c8a736b3309..b2f168b0a0d1 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1440,7 +1440,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, struct genericPartitionMap *gpm; uint16_t ident; struct buffer_head *bh; - unsigned int table_len; + unsigned int table_len, part_map_count; int ret; bh = udf_read_tagged(sb, block, block, &ident); @@ -1461,7 +1461,16 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, "logical volume"); if (ret) goto out_bh; - ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); + + part_map_count = le32_to_cpu(lvd->numPartitionMaps); + if (part_map_count > table_len / sizeof(struct genericPartitionMap1)) { + udf_err(sb, "error loading logical volume descriptor: " + "Too many partition maps (%u > %u)\n", part_map_count, + table_len / (unsigned)sizeof(struct genericPartitionMap1)); + ret = -EIO; + goto out_bh; + } + ret = udf_sb_alloc_partition_maps(sb, part_map_count); if (ret) goto out_bh; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 88d0062cfdb9..0388a1bae326 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 487ad1fc2de6..c2a391c17df7 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -38,7 +38,7 @@ const struct file_operations ufs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, + .mmap_prepare = generic_file_mmap_prepare, .open = generic_file_open, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7dc38fdef2ea..8361c00e8fa6 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -474,9 +474,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) } } -static int ufs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int ufs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -487,13 +488,14 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int ufs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int ufs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) ufs_write_failed(mapping, pos + len); return ret; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index eea718ac66b4..6e4585169f94 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -397,7 +397,7 @@ static int ufs_parse_param(struct fs_context *fc, struct fs_parameter *param) pr_err("ufstype can't be changed during remount\n"); return -EINVAL; } - if (!ctx->flavour) { + if (ctx->flavour) { pr_err("conflicting ufstype options\n"); return -EINVAL; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22f4bf956ba1..54c6cc7fe9c6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -165,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { - VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); - VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); - VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh)); + VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock)); + VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } @@ -304,7 +304,7 @@ again: goto out; ret = false; - if (!pmd_present(_pmd) || pmd_devmap(_pmd)) + if (!pmd_present(_pmd)) goto out; if (pmd_trans_huge(_pmd)) { @@ -383,12 +383,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (!ctx) goto out; - BUG_ON(ctx->mm != mm); + VM_WARN_ON_ONCE(ctx->mm != mm); /* Any unrecognized flag is a bug. */ - VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ - VM_BUG_ON(!reason || (reason & (reason - 1))); + VM_WARN_ON_ONCE(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; @@ -411,12 +411,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); + VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { - printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", - vmf->flags); + pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -602,7 +601,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, */ out: atomic_dec(&ctx->mmap_changing); - VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); + VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -710,7 +709,7 @@ void dup_userfaultfd_fail(struct list_head *fcs) struct userfaultfd_ctx *ctx = fctx->new; atomic_dec(&octx->mmap_changing); - VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); + VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0); userfaultfd_ctx_put(octx); userfaultfd_ctx_put(ctx); @@ -751,11 +750,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, if (!ctx) return; - if (to & ~PAGE_MASK) { - userfaultfd_ctx_put(ctx); - return; - } - msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMAP; @@ -766,6 +760,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } +void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx = vm_ctx->ctx; + + if (!ctx) + return; + + userfaultfd_ctx_put(ctx); +} + bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { @@ -1243,7 +1247,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; - unsigned long vm_flags; + vm_flags_t vm_flags; bool found; bool basic_ioctls; unsigned long start, end; @@ -1317,8 +1321,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; @@ -1372,7 +1376,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, found = true; } for_each_vma_range(vmi, cur, end); - BUG_ON(!found); + VM_WARN_ON_ONCE(!found); ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end, wp_async); @@ -1464,8 +1468,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & __VM_UFFD_FLAGS)); + VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & __VM_UFFD_FLAGS)); + + /* + * Prevent unregistering through a different userfaultfd than + * the one used for registration. + */ + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; /* * Check not compatible vmas, not strictly required @@ -1479,7 +1491,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, found = true; } for_each_vma_range(vmi, cur, end); - BUG_ON(!found); + VM_WARN_ON_ONCE(!found); vma_iter_set(&vmi, start); prev = vma_prev(&vmi); @@ -1490,16 +1502,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); - - /* - * Nothing to do: this vma is already registered into this - * userfaultfd and with the right tracking mode too. - */ + /* VMA not registered with userfaultfd. */ if (!vma->vm_userfaultfd_ctx.ctx) goto skip; - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async)); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); if (vma->vm_start > start) start = vma->vm_start; @@ -1564,7 +1573,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, * len == 0 means wake all and we don't want to wake all here, * so check it again to be sure. */ - VM_BUG_ON(!range.len); + VM_WARN_ON_ONCE(!range.len); wake_userfault(ctx, &range); ret = 0; @@ -1621,7 +1630,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, return -EFAULT; if (ret < 0) goto out; - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); /* len == 0 would wake all */ range.len = ret; if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { @@ -1676,7 +1685,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (ret < 0) goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { range.start = uffdio_zeropage.range.start; @@ -1788,7 +1797,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { range.start = uffdio_continue.range.start; @@ -1845,7 +1854,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long goto out; /* len == 0 would wake all */ - BUG_ON(!ret); + VM_WARN_ON_ONCE(!ret); range.len = ret; if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { range.start = uffdio_poison.range.start; @@ -2106,12 +2115,10 @@ static int new_userfaultfd(int flags) struct file *file; int fd; - BUG_ON(!current->mm); + VM_WARN_ON_ONCE(!current->mm); /* Check the UFFD_* constants for consistency. */ BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); - BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); - BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) return -EINVAL; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index b492794f8e9a..4bebd947314a 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -165,13 +165,13 @@ static const struct vm_operations_struct vboxsf_file_vm_ops = { .map_pages = filemap_map_pages, }; -static int vboxsf_file_mmap(struct file *file, struct vm_area_struct *vma) +static int vboxsf_file_mmap_prepare(struct vm_area_desc *desc) { int err; - err = generic_file_mmap(file, vma); + err = generic_file_mmap_prepare(desc); if (!err) - vma->vm_ops = &vboxsf_file_vm_ops; + desc->vm_ops = &vboxsf_file_vm_ops; return err; } @@ -213,7 +213,7 @@ const struct file_operations vboxsf_reg_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .mmap = vboxsf_file_mmap, + .mmap_prepare = vboxsf_file_mmap_prepare, .open = vboxsf_file_open, .release = vboxsf_file_release, .fsync = noop_fsync, @@ -300,12 +300,13 @@ static int vboxsf_writepages(struct address_space *mapping, return error; } -static int vboxsf_write_end(struct file *file, struct address_space *mapping, +static int vboxsf_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; - struct vboxsf_handle *sf_handle = file->private_data; + struct vboxsf_handle *sf_handle = iocb->ki_filp->private_data; size_t from = offset_in_folio(folio, pos); u32 nwritten = len; u8 *buf; diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 0bc96ab6580b..241647b060ee 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -189,7 +189,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize = 1024; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &vboxsf_super_ops; - sb->s_d_op = &vboxsf_dentry_ops; + set_default_d_op(sb, &vboxsf_dentry_ops); iroot = iget_locked(sb, 0); if (!iroot) { diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index 40569d3527a7..76d1c5971b82 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -2,11 +2,9 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" - select CRYPTO select CRYPTO_HASH_INFO - # SHA-256 is selected as it's intended to be the default hash algorithm. - # To avoid bloat, other wanted algorithms must be selected explicitly. - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported diff --git a/fs/verity/enable.c b/fs/verity/enable.c index c284f46d1b53..503268cf4296 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,7 +7,7 @@ #include "fsverity_private.h" -#include <crypto/hash.h> +#include <linux/export.h> #include <linux/mount.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> @@ -24,7 +24,6 @@ static int hash_one_block(struct inode *inode, struct block_buffer *cur) { struct block_buffer *next = cur + 1; - int err; /* * Safety check to prevent a buffer overflow in case of a filesystem bug @@ -37,10 +36,8 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - err = fsverity_hash_block(params, inode, cur->data, - &next->data[next->filled]); - if (err) - return err; + fsverity_hash_block(params, inode, cur->data, + &next->data[next->filled]); next->filled += params->digest_size; cur->filled = 0; return 0; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index b3506f56e180..5fe854a5b9ad 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -20,7 +20,6 @@ /* A hash algorithm supported by fs-verity */ struct fsverity_hash_alg { - struct crypto_shash *tfm; /* hash tfm, allocated on demand */ const char *name; /* crypto API name, e.g. sha256 */ unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ @@ -31,10 +30,16 @@ struct fsverity_hash_alg { enum hash_algo algo_id; }; +union fsverity_hash_ctx { + struct sha256_ctx sha256; + struct sha512_ctx sha512; +}; + /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ struct merkle_tree_params { const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ - const u8 *hashstate; /* initial hash state or NULL */ + /* initial hash state if salted, NULL if unsalted */ + const union fsverity_hash_ctx *hashstate; unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ unsigned int hashes_per_block; /* number of hashes per tree block */ @@ -76,16 +81,17 @@ struct fsverity_info { /* hash_algs.c */ -extern struct fsverity_hash_alg fsverity_hash_algs[]; +extern const struct fsverity_hash_alg fsverity_hash_algs[]; const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, unsigned int num); -const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, - const u8 *salt, size_t salt_size); -int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out); -int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, - const void *data, size_t size, u8 *out); +union fsverity_hash_ctx * +fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, + const u8 *salt, size_t salt_size); +void fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, const void *data, u8 *out); +void fsverity_hash_buffer(const struct fsverity_hash_alg *alg, + const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); /* init.c */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 6b08b1d9a7d7..9bb3c6344907 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -7,10 +7,8 @@ #include "fsverity_private.h" -#include <crypto/hash.h> - /* The hash algorithms supported by fs-verity */ -struct fsverity_hash_alg fsverity_hash_algs[] = { +const struct fsverity_hash_alg fsverity_hash_algs[] = { [FS_VERITY_HASH_ALG_SHA256] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, @@ -25,106 +23,42 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { }, }; -static DEFINE_MUTEX(fsverity_hash_alg_init_mutex); - /** - * fsverity_get_hash_alg() - validate and prepare a hash algorithm + * fsverity_get_hash_alg() - get a hash algorithm by number * @inode: optional inode for logging purposes * @num: the hash algorithm number * - * Get the struct fsverity_hash_alg for the given hash algorithm number, and - * ensure it has a hash transform ready to go. The hash transforms are - * allocated on-demand so that we don't waste resources unnecessarily, and - * because the crypto modules may be initialized later than fs/verity/. + * Get the struct fsverity_hash_alg for the given hash algorithm number. * - * Return: pointer to the hash alg on success, else an ERR_PTR() + * Return: pointer to the hash alg if it's known, otherwise NULL. */ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, unsigned int num) { - struct fsverity_hash_alg *alg; - struct crypto_shash *tfm; - int err; - if (num >= ARRAY_SIZE(fsverity_hash_algs) || !fsverity_hash_algs[num].name) { fsverity_warn(inode, "Unknown hash algorithm number: %u", num); - return ERR_PTR(-EINVAL); - } - alg = &fsverity_hash_algs[num]; - - /* pairs with smp_store_release() below */ - if (likely(smp_load_acquire(&alg->tfm) != NULL)) - return alg; - - mutex_lock(&fsverity_hash_alg_init_mutex); - - if (alg->tfm != NULL) - goto out_unlock; - - tfm = crypto_alloc_shash(alg->name, 0, 0); - if (IS_ERR(tfm)) { - if (PTR_ERR(tfm) == -ENOENT) { - fsverity_warn(inode, - "Missing crypto API support for hash algorithm \"%s\"", - alg->name); - alg = ERR_PTR(-ENOPKG); - goto out_unlock; - } - fsverity_err(inode, - "Error allocating hash algorithm \"%s\": %ld", - alg->name, PTR_ERR(tfm)); - alg = ERR_CAST(tfm); - goto out_unlock; + return NULL; } - - err = -EINVAL; - if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm))) - goto err_free_tfm; - if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm))) - goto err_free_tfm; - - pr_info("%s using implementation \"%s\"\n", - alg->name, crypto_shash_driver_name(tfm)); - - /* pairs with smp_load_acquire() above */ - smp_store_release(&alg->tfm, tfm); - goto out_unlock; - -err_free_tfm: - crypto_free_shash(tfm); - alg = ERR_PTR(err); -out_unlock: - mutex_unlock(&fsverity_hash_alg_init_mutex); - return alg; + return &fsverity_hash_algs[num]; } /** * fsverity_prepare_hash_state() - precompute the initial hash state * @alg: hash algorithm * @salt: a salt which is to be prepended to all data to be hashed - * @salt_size: salt size in bytes, possibly 0 + * @salt_size: salt size in bytes * - * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed - * initial hash state on success or an ERR_PTR() on failure. + * Return: the kmalloc()'ed initial hash state, or NULL if out of memory. */ -const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, - const u8 *salt, size_t salt_size) +union fsverity_hash_ctx * +fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, + const u8 *salt, size_t salt_size) { - u8 *hashstate = NULL; - SHASH_DESC_ON_STACK(desc, alg->tfm); u8 *padded_salt = NULL; size_t padded_salt_size; - int err; - - desc->tfm = alg->tfm; - - if (salt_size == 0) - return NULL; - - hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL); - if (!hashstate) - return ERR_PTR(-ENOMEM); + union fsverity_hash_ctx ctx; + void *res = NULL; /* * Zero-pad the salt to the next multiple of the input size of the hash @@ -135,30 +69,26 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, */ padded_salt_size = round_up(salt_size, alg->block_size); padded_salt = kzalloc(padded_salt_size, GFP_KERNEL); - if (!padded_salt) { - err = -ENOMEM; - goto err_free; - } + if (!padded_salt) + return NULL; memcpy(padded_salt, salt, salt_size); - err = crypto_shash_init(desc); - if (err) - goto err_free; - - err = crypto_shash_update(desc, padded_salt, padded_salt_size); - if (err) - goto err_free; - - err = crypto_shash_export(desc, hashstate); - if (err) - goto err_free; -out: - kfree(padded_salt); - return hashstate; -err_free: - kfree(hashstate); - hashstate = ERR_PTR(err); - goto out; + switch (alg->algo_id) { + case HASH_ALGO_SHA256: + sha256_init(&ctx.sha256); + sha256_update(&ctx.sha256, padded_salt, padded_salt_size); + res = kmemdup(&ctx.sha256, sizeof(ctx.sha256), GFP_KERNEL); + break; + case HASH_ALGO_SHA512: + sha512_init(&ctx.sha512); + sha512_update(&ctx.sha512, padded_salt, padded_salt_size); + res = kmemdup(&ctx.sha512, sizeof(ctx.sha512), GFP_KERNEL); + break; + default: + WARN_ON_ONCE(1); + } + kfree(padded_salt); + return res; } /** @@ -170,31 +100,32 @@ err_free: * * Hash a single data or hash block. The hash is salted if a salt is specified * in the Merkle tree parameters. - * - * Return: 0 on success, -errno on failure */ -int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, const void *data, u8 *out) +void fsverity_hash_block(const struct merkle_tree_params *params, + const struct inode *inode, const void *data, u8 *out) { - SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm); - int err; - - desc->tfm = params->hash_alg->tfm; - - if (params->hashstate) { - err = crypto_shash_import(desc, params->hashstate); - if (err) { - fsverity_err(inode, - "Error %d importing hash state", err); - return err; - } - err = crypto_shash_finup(desc, data, params->block_size, out); - } else { - err = crypto_shash_digest(desc, data, params->block_size, out); + union fsverity_hash_ctx ctx; + + if (!params->hashstate) { + fsverity_hash_buffer(params->hash_alg, data, params->block_size, + out); + return; + } + + switch (params->hash_alg->algo_id) { + case HASH_ALGO_SHA256: + ctx.sha256 = params->hashstate->sha256; + sha256_update(&ctx.sha256, data, params->block_size); + sha256_final(&ctx.sha256, out); + return; + case HASH_ALGO_SHA512: + ctx.sha512 = params->hashstate->sha512; + sha512_update(&ctx.sha512, data, params->block_size); + sha512_final(&ctx.sha512, out); + return; + default: + BUG(); } - if (err) - fsverity_err(inode, "Error %d computing block hash", err); - return err; } /** @@ -203,13 +134,20 @@ int fsverity_hash_block(const struct merkle_tree_params *params, * @data: the data to hash * @size: size of data to hash, in bytes * @out: output digest, size 'alg->digest_size' bytes - * - * Return: 0 on success, -errno on failure */ -int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, - const void *data, size_t size, u8 *out) +void fsverity_hash_buffer(const struct fsverity_hash_alg *alg, + const void *data, size_t size, u8 *out) { - return crypto_shash_tfm_digest(alg->tfm, data, size, out); + switch (alg->algo_id) { + case HASH_ALGO_SHA256: + sha256(data, size, out); + return; + case HASH_ALGO_SHA512: + sha512(data, size, out); + return; + default: + BUG(); + } } void __init fsverity_check_hash_algs(void) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 175d2f1bc089..388734132f01 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -9,6 +9,7 @@ #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/export.h> #include <linux/uaccess.h> /** diff --git a/fs/verity/open.c b/fs/verity/open.c index fdeb95eca3af..c561e130cd0c 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <linux/export.h> #include <linux/mm.h> #include <linux/slab.h> @@ -42,18 +43,18 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, memset(params, 0, sizeof(*params)); hash_alg = fsverity_get_hash_alg(inode, hash_algorithm); - if (IS_ERR(hash_alg)) - return PTR_ERR(hash_alg); + if (!hash_alg) + return -EINVAL; params->hash_alg = hash_alg; params->digest_size = hash_alg->digest_size; - params->hashstate = fsverity_prepare_hash_state(hash_alg, salt, - salt_size); - if (IS_ERR(params->hashstate)) { - err = PTR_ERR(params->hashstate); - params->hashstate = NULL; - fsverity_err(inode, "Error %d preparing hash state", err); - goto out_err; + if (salt_size) { + params->hashstate = + fsverity_prepare_hash_state(hash_alg, salt, salt_size); + if (!params->hashstate) { + err = -ENOMEM; + goto out_err; + } } /* @@ -158,18 +159,15 @@ out_err: * Compute the file digest by hashing the fsverity_descriptor excluding the * builtin signature and with the sig_size field set to 0. */ -static int compute_file_digest(const struct fsverity_hash_alg *hash_alg, - struct fsverity_descriptor *desc, - u8 *file_digest) +static void compute_file_digest(const struct fsverity_hash_alg *hash_alg, + struct fsverity_descriptor *desc, + u8 *file_digest) { __le32 sig_size = desc->sig_size; - int err; desc->sig_size = 0; - err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest); + fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest); desc->sig_size = sig_size; - - return err; } /* @@ -201,12 +199,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); - err = compute_file_digest(vi->tree_params.hash_alg, desc, - vi->file_digest); - if (err) { - fsverity_err(inode, "Error %d computing file digest", err); - goto fail; - } + compute_file_digest(vi->tree_params.hash_alg, desc, vi->file_digest); err = fsverity_verify_signature(vi, desc->signature, le32_to_cpu(desc->sig_size)); diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index f58432772d9e..cba5d6af4e04 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -8,6 +8,7 @@ #include "fsverity_private.h" #include <linux/backing-dev.h> +#include <linux/export.h> #include <linux/highmem.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 4fcad0825a12..a1f00c3fd3b2 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -7,8 +7,8 @@ #include "fsverity_private.h" -#include <crypto/hash.h> #include <linux/bio.h> +#include <linux/export.h> static struct workqueue_struct *fsverity_read_workqueue; @@ -202,8 +202,7 @@ descend: unsigned long hblock_idx = hblocks[level - 1].index; unsigned int hoffset = hblocks[level - 1].hoffset; - if (fsverity_hash_block(params, inode, haddr, real_hash) != 0) - goto error; + fsverity_hash_block(params, inode, haddr, real_hash); if (memcmp(want_hash, real_hash, hsize) != 0) goto corrupted; /* @@ -222,8 +221,7 @@ descend: } /* Finally, verify the data block. */ - if (fsverity_hash_block(params, inode, data, real_hash) != 0) - goto error; + fsverity_hash_block(params, inode, data, real_hash); if (memcmp(want_hash, real_hash, hsize) != 0) goto corrupted; return true; diff --git a/fs/xattr.c b/fs/xattr.c index 8ec5b0204bfd..8851a5ef34f5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -215,7 +215,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * * returns the result of the internal setxattr or setsecurity operations. * - * This function requires the caller to lock the inode's i_mutex before it + * This function requires the caller to lock the inode's i_rwsem before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ @@ -1479,6 +1479,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, buffer += err; } remaining_size -= err; + err = 0; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7839efe050bf..000cc7f4a3ce 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3444,16 +3444,41 @@ xfs_alloc_read_agf( set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } + #ifdef DEBUG - else if (!xfs_is_shutdown(mp)) { - ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); - ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); - ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); - ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); - ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); + /* + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. + */ + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks); + ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount); + ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest); + ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level); + ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); + xfs_trans_brelse(tp, agfbp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } } -#endif +#endif /* DEBUG */ + if (agfbpp) *agfbpp = agfbp; else diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a4ac37ba5d51..fa1f03c1331e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -186,35 +186,32 @@ xfs_allocbt_init_ptr_from_cur( ptr->s = agf->agf_cnt_root; } -STATIC int64_t -xfs_bnobt_key_diff( +STATIC int +xfs_bnobt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; const struct xfs_alloc_rec *kp = &key->alloc; - return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return cmp_int(be32_to_cpu(kp->ar_startblock), + rec->ar_startblock); } -STATIC int64_t -xfs_cntbt_key_diff( +STATIC int +xfs_cntbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; const struct xfs_alloc_rec *kp = &key->alloc; - int64_t diff; - diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; - if (diff) - return diff; - - return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return cmp_int(be32_to_cpu(kp->ar_blockcount), rec->ar_blockcount) ?: + cmp_int(be32_to_cpu(kp->ar_startblock), rec->ar_startblock); } -STATIC int64_t -xfs_bnobt_diff_two_keys( +STATIC int +xfs_bnobt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -222,29 +219,24 @@ xfs_bnobt_diff_two_keys( { ASSERT(!mask || mask->alloc.ar_startblock); - return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - - be32_to_cpu(k2->alloc.ar_startblock); + return cmp_int(be32_to_cpu(k1->alloc.ar_startblock), + be32_to_cpu(k2->alloc.ar_startblock)); } -STATIC int64_t -xfs_cntbt_diff_two_keys( +STATIC int +xfs_cntbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, const union xfs_btree_key *mask) { - int64_t diff; - ASSERT(!mask || (mask->alloc.ar_blockcount && mask->alloc.ar_startblock)); - diff = be32_to_cpu(k1->alloc.ar_blockcount) - - be32_to_cpu(k2->alloc.ar_blockcount); - if (diff) - return diff; - - return be32_to_cpu(k1->alloc.ar_startblock) - - be32_to_cpu(k2->alloc.ar_startblock); + return cmp_int(be32_to_cpu(k1->alloc.ar_blockcount), + be32_to_cpu(k2->alloc.ar_blockcount)) ?: + cmp_int(be32_to_cpu(k1->alloc.ar_startblock), + be32_to_cpu(k2->alloc.ar_startblock)); } static xfs_failaddr_t @@ -438,9 +430,9 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_bnobt_key_diff, + .cmp_key_with_cur = xfs_bnobt_cmp_key_with_cur, .buf_ops = &xfs_bnobt_buf_ops, - .diff_two_keys = xfs_bnobt_diff_two_keys, + .cmp_two_keys = xfs_bnobt_cmp_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, .keys_contiguous = xfs_allocbt_keys_contiguous, @@ -468,9 +460,9 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_cntbt_key_diff, + .cmp_key_with_cur = xfs_cntbt_cmp_key_with_cur, .buf_ops = &xfs_cntbt_buf_ops, - .diff_two_keys = xfs_cntbt_diff_two_keys, + .cmp_two_keys = xfs_cntbt_cmp_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, .keys_contiguous = NULL, /* not needed right now */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 908d7b050e9c..188feac04b60 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -369,38 +369,26 @@ xfs_bmbt_init_rec_from_cur( xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); } -STATIC int64_t -xfs_bmbt_key_diff( +STATIC int +xfs_bmbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - - cur->bc_rec.b.br_startoff; + return cmp_int(be64_to_cpu(key->bmbt.br_startoff), + cur->bc_rec.b.br_startoff); } -STATIC int64_t -xfs_bmbt_diff_two_keys( +STATIC int +xfs_bmbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, const union xfs_btree_key *mask) { - uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); - uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); - ASSERT(!mask || mask->bmbt.br_startoff); - /* - * Note: This routine previously casted a and b to int64 and subtracted - * them to generate a result. This lead to problems if b was the - * "maximum" key value (all ones) being signed incorrectly, hence this - * somewhat less efficient version. - */ - if (a > b) - return 1; - if (b > a) - return -1; - return 0; + return cmp_int(be64_to_cpu(k1->bmbt.br_startoff), + be64_to_cpu(k2->bmbt.br_startoff)); } static xfs_failaddr_t @@ -647,8 +635,8 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .init_key_from_rec = xfs_bmbt_init_key_from_rec, .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, - .key_diff = xfs_bmbt_key_diff, - .diff_two_keys = xfs_bmbt_diff_two_keys, + .cmp_key_with_cur = xfs_bmbt_cmp_key_with_cur, + .cmp_two_keys = xfs_bmbt_cmp_two_keys, .buf_ops = &xfs_bmbt_buf_ops, .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 299ce7fd11b0..a61211d253f1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1985,7 +1985,7 @@ xfs_btree_lookup( int *stat) /* success/failure */ { struct xfs_btree_block *block; /* current btree block */ - int64_t diff; /* difference for the current key */ + int cmp_r; /* current key comparison result */ int error; /* error return value */ int keyno; /* current key number */ int level; /* level in the btree */ @@ -2013,13 +2013,13 @@ xfs_btree_lookup( * on the lookup record, then follow the corresponding block * pointer down to the next level. */ - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + for (level = cur->bc_nlevels - 1, cmp_r = 1; level >= 0; level--) { /* Get the block we need to do the lookup on. */ error = xfs_btree_lookup_get_block(cur, level, pp, &block); if (error) goto error0; - if (diff == 0) { + if (cmp_r == 0) { /* * If we already had a key match at a higher level, we * know we need to use the first entry in this block. @@ -2065,15 +2065,16 @@ xfs_btree_lookup( keyno, block, &key); /* - * Compute difference to get next direction: + * Compute comparison result to get next + * direction: * - less than, move right * - greater than, move left * - equal, we're done */ - diff = cur->bc_ops->key_diff(cur, kp); - if (diff < 0) + cmp_r = cur->bc_ops->cmp_key_with_cur(cur, kp); + if (cmp_r < 0) low = keyno + 1; - else if (diff > 0) + else if (cmp_r > 0) high = keyno - 1; else break; @@ -2089,7 +2090,7 @@ xfs_btree_lookup( * If we moved left, need the previous key number, * unless there isn't one. */ - if (diff > 0 && --keyno < 1) + if (cmp_r > 0 && --keyno < 1) keyno = 1; pp = xfs_btree_ptr_addr(cur, keyno, block); @@ -2102,7 +2103,7 @@ xfs_btree_lookup( } /* Done with the search. See if we need to adjust the results. */ - if (dir != XFS_LOOKUP_LE && diff < 0) { + if (dir != XFS_LOOKUP_LE && cmp_r < 0) { keyno++; /* * If ge search and we went off the end of the block, but it's @@ -2125,14 +2126,14 @@ xfs_btree_lookup( *stat = 1; return 0; } - } else if (dir == XFS_LOOKUP_LE && diff > 0) + } else if (dir == XFS_LOOKUP_LE && cmp_r > 0) keyno--; cur->bc_levels[0].ptr = keyno; /* Return if we succeeded or not. */ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) *stat = 0; - else if (dir != XFS_LOOKUP_EQ || diff == 0) + else if (dir != XFS_LOOKUP_EQ || cmp_r == 0) *stat = 1; else *stat = 0; @@ -5058,7 +5059,7 @@ xfs_btree_simple_query_range( int error; ASSERT(cur->bc_ops->init_high_key_from_rec); - ASSERT(cur->bc_ops->diff_two_keys); + ASSERT(cur->bc_ops->cmp_two_keys); /* * Find the leftmost record. The btree cursor must be set @@ -5352,15 +5353,15 @@ xfs_btree_count_blocks( } /* Compare two btree pointers. */ -int64_t -xfs_btree_diff_two_ptrs( +int +xfs_btree_cmp_two_ptrs( struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); - return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); + return cmp_int(be64_to_cpu(a->l), be64_to_cpu(b->l)); + return cmp_int(be32_to_cpu(a->s), be32_to_cpu(b->s)); } struct xfs_btree_has_records { diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 355b304696e6..60e78572e725 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -171,20 +171,23 @@ struct xfs_btree_ops { void (*init_high_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); - /* difference between key value and cursor value */ - int64_t (*key_diff)(struct xfs_btree_cur *cur, - const union xfs_btree_key *key); + /* + * Compare key value and cursor value -- positive if key > cur, + * negative if key < cur, and zero if equal. + */ + int (*cmp_key_with_cur)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key); /* - * Difference between key2 and key1 -- positive if key1 > key2, - * negative if key1 < key2, and zero if equal. If the @mask parameter - * is non NULL, each key field to be used in the comparison must - * contain a nonzero value. + * Compare key1 and key2 -- positive if key1 > key2, negative if + * key1 < key2, and zero if equal. If the @mask parameter is non NULL, + * each key field to be used in the comparison must contain a nonzero + * value. */ - int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, - const union xfs_btree_key *key1, - const union xfs_btree_key *key2, - const union xfs_btree_key *mask); + int (*cmp_two_keys)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; @@ -516,9 +519,9 @@ struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr); -int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, - const union xfs_btree_ptr *a, - const union xfs_btree_ptr *b); +int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *a, + const union xfs_btree_ptr *b); void xfs_btree_get_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr); @@ -546,7 +549,7 @@ xfs_btree_keycmp_lt( const union xfs_btree_key *key1, const union xfs_btree_key *key2) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0; } static inline bool @@ -555,7 +558,7 @@ xfs_btree_keycmp_gt( const union xfs_btree_key *key1, const union xfs_btree_key *key2) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0; } static inline bool @@ -564,7 +567,7 @@ xfs_btree_keycmp_eq( const union xfs_btree_key *key1, const union xfs_btree_key *key2) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0; } static inline bool @@ -602,7 +605,7 @@ xfs_btree_masked_keycmp_lt( const union xfs_btree_key *key2, const union xfs_btree_key *mask) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0; } static inline bool @@ -612,7 +615,7 @@ xfs_btree_masked_keycmp_gt( const union xfs_btree_key *key2, const union xfs_btree_key *mask) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0; } static inline bool diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9566a7623365..779dac59b1f3 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -112,7 +112,7 @@ typedef struct xfs_sb { uint16_t sb_sectsize; /* volume sector size, bytes */ uint16_t sb_inodesize; /* inode size, bytes */ uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[XFSLABEL_MAX]; /* file system name */ + char sb_fname[XFSLABEL_MAX] __nonstring; /* file system name */ uint8_t sb_blocklog; /* log2 of sb_blocksize */ uint8_t sb_sectlog; /* log2 of sb_sectsize */ uint8_t sb_inodelog; /* log2 of sb_inodesize */ diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c index e9d76bcdc820..792f76d2e2a0 100644 --- a/fs/xfs/libxfs/xfs_group.c +++ b/fs/xfs/libxfs/xfs_group.c @@ -163,7 +163,8 @@ xfs_group_free( xfs_defer_drain_free(&xg->xg_intents_drain); #ifdef __KERNEL__ - kfree(xg->xg_busy_extents); + if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type)) + kfree(xg->xg_busy_extents); #endif if (uninit) @@ -171,7 +172,8 @@ xfs_group_free( /* drop the mount's active reference */ xfs_group_rele(xg); - XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) > 0); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) < 0); kfree_rcu_mightsleep(xg); } @@ -189,9 +191,11 @@ xfs_group_insert( xg->xg_type = type; #ifdef __KERNEL__ - xg->xg_busy_extents = xfs_extent_busy_alloc(); - if (!xg->xg_busy_extents) - return -ENOMEM; + if (xfs_group_has_extent_busy(mp, type)) { + xg->xg_busy_extents = xfs_extent_busy_alloc(); + if (!xg->xg_busy_extents) + return -ENOMEM; + } spin_lock_init(&xg->xg_state_lock); xfs_hooks_init(&xg->xg_rmap_update_hooks); #endif @@ -210,7 +214,8 @@ xfs_group_insert( out_drain: xfs_defer_drain_free(&xg->xg_intents_drain); #ifdef __KERNEL__ - kfree(xg->xg_busy_extents); + if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type)) + kfree(xg->xg_busy_extents); #endif return error; } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0c47b5c6ca7d..750111634d9f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi( set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } +#ifdef DEBUG /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag_mount(pag))); + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount); + ok &= pag->pagi_count == be32_to_cpu(agi->agi_count); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + xfs_trans_brelse(tp, agibp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + } +#endif /* DEBUG */ + if (agibpp) *agibpp = agibp; else diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 6f270d8f4270..100afdd66cdd 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -265,17 +265,17 @@ xfs_finobt_init_ptr_from_cur( ptr->s = agi->agi_free_root; } -STATIC int64_t -xfs_inobt_key_diff( +STATIC int +xfs_inobt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - return (int64_t)be32_to_cpu(key->inobt.ir_startino) - - cur->bc_rec.i.ir_startino; + return cmp_int(be32_to_cpu(key->inobt.ir_startino), + cur->bc_rec.i.ir_startino); } -STATIC int64_t -xfs_inobt_diff_two_keys( +STATIC int +xfs_inobt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -283,8 +283,8 @@ xfs_inobt_diff_two_keys( { ASSERT(!mask || mask->inobt.ir_startino); - return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - - be32_to_cpu(k2->inobt.ir_startino); + return cmp_int(be32_to_cpu(k1->inobt.ir_startino), + be32_to_cpu(k2->inobt.ir_startino)); } static xfs_failaddr_t @@ -430,9 +430,9 @@ const struct xfs_btree_ops xfs_inobt_ops = { .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, - .key_diff = xfs_inobt_key_diff, + .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur, .buf_ops = &xfs_inobt_buf_ops, - .diff_two_keys = xfs_inobt_diff_two_keys, + .cmp_two_keys = xfs_inobt_cmp_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, .keys_contiguous = xfs_inobt_keys_contiguous, @@ -460,9 +460,9 @@ const struct xfs_btree_ops xfs_finobt_ops = { .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, - .key_diff = xfs_inobt_key_diff, + .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur, .buf_ops = &xfs_finobt_buf_ops, - .diff_two_keys = xfs_inobt_diff_two_keys, + .cmp_two_keys = xfs_inobt_cmp_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, .keys_contiguous = xfs_inobt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 66c7916fb5cd..95de23095030 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -104,7 +104,7 @@ struct xlog_recover_item { struct list_head ri_list; int ri_cnt; /* count of regions found */ int ri_total; /* total regions */ - struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */ + struct kvec *ri_buf; /* ptr to regions buffer */ const struct xlog_recover_item_ops *ri_ops; }; @@ -117,7 +117,7 @@ struct xlog_recover { struct list_head r_itemq; /* q for items */ }; -#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].iov_base) #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index cebe83f7842a..897784037483 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -2099,9 +2099,7 @@ xfs_refcount_recover_cow_leftovers( * recording the CoW debris we cancel the (empty) transaction * and everything goes away cleanly. */ - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); if (isrt) { xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 54505fee1852..06da3ca14727 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -174,8 +174,8 @@ xfs_refcountbt_init_ptr_from_cur( ptr->s = agf->agf_refcount_root; } -STATIC int64_t -xfs_refcountbt_key_diff( +STATIC int +xfs_refcountbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { @@ -185,11 +185,11 @@ xfs_refcountbt_key_diff( start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); - return (int64_t)be32_to_cpu(kp->rc_startblock) - start; + return cmp_int(be32_to_cpu(kp->rc_startblock), start); } -STATIC int64_t -xfs_refcountbt_diff_two_keys( +STATIC int +xfs_refcountbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -197,8 +197,8 @@ xfs_refcountbt_diff_two_keys( { ASSERT(!mask || mask->refc.rc_startblock); - return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + return cmp_int(be32_to_cpu(k1->refc.rc_startblock), + be32_to_cpu(k2->refc.rc_startblock)); } STATIC xfs_failaddr_t @@ -339,9 +339,9 @@ const struct xfs_btree_ops xfs_refcountbt_ops = { .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, - .key_diff = xfs_refcountbt_key_diff, + .cmp_key_with_cur = xfs_refcountbt_cmp_key_with_cur, .buf_ops = &xfs_refcountbt_buf_ops, - .diff_two_keys = xfs_refcountbt_diff_two_keys, + .cmp_two_keys = xfs_refcountbt_cmp_two_keys, .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, .keys_contiguous = xfs_refcountbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 2cab694ac58a..bf16aee50d73 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -243,38 +243,22 @@ static inline uint64_t offset_keymask(uint64_t offset) return offset & ~XFS_RMAP_OFF_UNWRITTEN; } -STATIC int64_t -xfs_rmapbt_key_diff( +STATIC int +xfs_rmapbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_rmap_irec *rec = &cur->bc_rec.r; const struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; - d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; - if (d) - return d; - - x = be64_to_cpu(kp->rm_owner); - y = rec->rm_owner; - if (x > y) - return 1; - else if (y > x) - return -1; - - x = offset_keymask(be64_to_cpu(kp->rm_offset)); - y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); - if (x > y) - return 1; - else if (y > x) - return -1; - return 0; + return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?: + cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?: + cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)), + offset_keymask(xfs_rmap_irec_offset_pack(rec))); } -STATIC int64_t -xfs_rmapbt_diff_two_keys( +STATIC int +xfs_rmapbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -282,36 +266,31 @@ xfs_rmapbt_diff_two_keys( { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + int d; /* Doesn't make sense to mask off the physical space part */ ASSERT(!mask || mask->rmap.rm_startblock); - d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + d = cmp_int(be32_to_cpu(kp1->rm_startblock), + be32_to_cpu(kp2->rm_startblock)); if (d) return d; if (!mask || mask->rmap.rm_owner) { - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(be64_to_cpu(kp1->rm_owner), + be64_to_cpu(kp2->rm_owner)); + if (d) + return d; } if (!mask || mask->rmap.rm_offset) { /* Doesn't make sense to allow offset but not owner */ ASSERT(!mask || mask->rmap.rm_owner); - x = offset_keymask(be64_to_cpu(kp1->rm_offset)); - y = offset_keymask(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)), + offset_keymask(be64_to_cpu(kp2->rm_offset))); + if (d) + return d; } return 0; @@ -515,9 +494,9 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, - .key_diff = xfs_rmapbt_key_diff, + .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur, .buf_ops = &xfs_rmapbt_buf_ops, - .diff_two_keys = xfs_rmapbt_diff_two_keys, + .cmp_two_keys = xfs_rmapbt_cmp_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, .keys_contiguous = xfs_rmapbt_keys_contiguous, @@ -632,9 +611,9 @@ const struct xfs_btree_ops xfs_rmapbt_mem_ops = { .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, .init_ptr_from_cur = xfbtree_init_ptr_from_cur, - .key_diff = xfs_rmapbt_key_diff, + .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur, .buf_ops = &xfs_rmapbt_mem_buf_ops, - .diff_two_keys = xfs_rmapbt_diff_two_keys, + .cmp_two_keys = xfs_rmapbt_cmp_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, .keys_contiguous = xfs_rmapbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index 3db5e7a4a945..ac11e94b42ae 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -156,8 +156,8 @@ xfs_rtrefcountbt_init_ptr_from_cur( ptr->l = 0; } -STATIC int64_t -xfs_rtrefcountbt_key_diff( +STATIC int +xfs_rtrefcountbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { @@ -167,11 +167,11 @@ xfs_rtrefcountbt_key_diff( start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); - return (int64_t)be32_to_cpu(kp->rc_startblock) - start; + return cmp_int(be32_to_cpu(kp->rc_startblock), start); } -STATIC int64_t -xfs_rtrefcountbt_diff_two_keys( +STATIC int +xfs_rtrefcountbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -179,8 +179,8 @@ xfs_rtrefcountbt_diff_two_keys( { ASSERT(!mask || mask->refc.rc_startblock); - return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + return cmp_int(be32_to_cpu(k1->refc.rc_startblock), + be32_to_cpu(k2->refc.rc_startblock)); } static xfs_failaddr_t @@ -387,9 +387,9 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, - .key_diff = xfs_rtrefcountbt_key_diff, + .cmp_key_with_cur = xfs_rtrefcountbt_cmp_key_with_cur, .buf_ops = &xfs_rtrefcountbt_buf_ops, - .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .cmp_two_keys = xfs_rtrefcountbt_cmp_two_keys, .keys_inorder = xfs_rtrefcountbt_keys_inorder, .recs_inorder = xfs_rtrefcountbt_recs_inorder, .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 9bdc2cbfc113..55f903165769 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -185,38 +185,22 @@ static inline uint64_t offset_keymask(uint64_t offset) return offset & ~XFS_RMAP_OFF_UNWRITTEN; } -STATIC int64_t -xfs_rtrmapbt_key_diff( +STATIC int +xfs_rtrmapbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_rmap_irec *rec = &cur->bc_rec.r; const struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; - d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; - if (d) - return d; - - x = be64_to_cpu(kp->rm_owner); - y = rec->rm_owner; - if (x > y) - return 1; - else if (y > x) - return -1; - - x = offset_keymask(be64_to_cpu(kp->rm_offset)); - y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); - if (x > y) - return 1; - else if (y > x) - return -1; - return 0; + return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?: + cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?: + cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)), + offset_keymask(xfs_rmap_irec_offset_pack(rec))); } -STATIC int64_t -xfs_rtrmapbt_diff_two_keys( +STATIC int +xfs_rtrmapbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -224,36 +208,31 @@ xfs_rtrmapbt_diff_two_keys( { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + int d; /* Doesn't make sense to mask off the physical space part */ ASSERT(!mask || mask->rmap.rm_startblock); - d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + d = cmp_int(be32_to_cpu(kp1->rm_startblock), + be32_to_cpu(kp2->rm_startblock)); if (d) return d; if (!mask || mask->rmap.rm_owner) { - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(be64_to_cpu(kp1->rm_owner), + be64_to_cpu(kp2->rm_owner)); + if (d) + return d; } if (!mask || mask->rmap.rm_offset) { /* Doesn't make sense to allow offset but not owner */ ASSERT(!mask || mask->rmap.rm_owner); - x = offset_keymask(be64_to_cpu(kp1->rm_offset)); - y = offset_keymask(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)), + offset_keymask(be64_to_cpu(kp2->rm_offset))); + if (d) + return d; } return 0; @@ -511,9 +490,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = { .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, - .key_diff = xfs_rtrmapbt_key_diff, + .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur, .buf_ops = &xfs_rtrmapbt_buf_ops, - .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys, .keys_inorder = xfs_rtrmapbt_keys_inorder, .recs_inorder = xfs_rtrmapbt_recs_inorder, .keys_contiguous = xfs_rtrmapbt_keys_contiguous, @@ -620,9 +599,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, .init_ptr_from_cur = xfbtree_init_ptr_from_cur, - .key_diff = xfs_rtrmapbt_key_diff, + .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur, .buf_ops = &xfs_rtrmapbt_mem_buf_ops, - .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys, .keys_inorder = xfs_rtrmapbt_keys_inorder, .recs_inorder = xfs_rtrmapbt_recs_inorder, .keys_contiguous = xfs_rtrmapbt_keys_contiguous, diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index fe678a0438bc..cd6f0ff382a7 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -306,7 +306,7 @@ xchk_btree_block_check_sibling( if (pbp) xchk_buffer_recheck(bs->sc, pbp); - if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) + if (xfs_btree_cmp_two_ptrs(cur, pp, sibling)) xchk_btree_set_corrupt(bs->sc, cur, level); out: xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 28ad341df8ee..2ef7742be7d3 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -866,11 +866,11 @@ xchk_trans_cancel( sc->tp = NULL; } -int +void xchk_trans_alloc_empty( struct xfs_scrub *sc) { - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + sc->tp = xfs_trans_alloc_empty(sc->mp); } /* @@ -892,7 +892,8 @@ xchk_trans_alloc( return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, resblks, 0, 0, &sc->tp); - return xchk_trans_alloc_empty(sc); + xchk_trans_alloc_empty(sc); + return 0; } /* Set us up with a transaction and an empty context. */ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 19877d99f255..ddbc065c798c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -7,7 +7,7 @@ #define __XFS_SCRUB_COMMON_H__ int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); -int xchk_trans_alloc_empty(struct xfs_scrub *sc); +void xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c index 249313882108..8d3b550990b5 100644 --- a/fs/xfs/scrub/dir_repair.c +++ b/fs/xfs/scrub/dir_repair.c @@ -1289,9 +1289,7 @@ xrep_dir_scan_dirtree( if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { bool flush; @@ -1317,9 +1315,7 @@ xrep_dir_scan_dirtree( if (error) break; - error = xchk_trans_alloc_empty(sc); - if (error) - break; + xchk_trans_alloc_empty(sc); } if (xchk_should_terminate(sc, &error)) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 9b598c5790ad..cebd0d526926 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -237,7 +237,8 @@ xchk_setup_fscounters( return error; } - return xchk_trans_alloc_empty(sc); + xchk_trans_alloc_empty(sc); + return 0; } /* diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index e21c16fbd15d..14939d7de349 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -318,9 +318,7 @@ xchk_metapath( return 0; } - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); error = xchk_metapath_ilock_both(mpath); if (error) diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 4a47d0aabf73..26721fab5cab 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -555,9 +555,7 @@ xchk_nlinks_collect( * do not take sb_internal. */ xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { if (S_ISDIR(VFS_I(ip)->i_mode)) @@ -880,9 +878,7 @@ xchk_nlinks_compare( * inactivation workqueue. */ xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); /* * Use the inobt to walk all allocated inodes to compare the link diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index 4ebdee095428..6ef2ee9c3814 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -340,9 +340,7 @@ xrep_nlinks( * We can only push the inactivation workqueues with an empty * transaction. */ - error = xchk_trans_alloc_empty(sc); - if (error) - break; + xchk_trans_alloc_empty(sc); } xchk_iscan_iter_finish(&xnc->compare_iscan); xchk_iscan_teardown(&xnc->compare_iscan); diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c index 31bfe10be22a..2949feda6271 100644 --- a/fs/xfs/scrub/parent_repair.c +++ b/fs/xfs/scrub/parent_repair.c @@ -569,9 +569,7 @@ xrep_parent_scan_dirtree( if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) { bool flush; @@ -597,9 +595,7 @@ xrep_parent_scan_dirtree( if (error) break; - error = xchk_trans_alloc_empty(sc); - if (error) - break; + xchk_trans_alloc_empty(sc); } if (xchk_should_terminate(sc, &error)) @@ -1099,9 +1095,7 @@ xrep_parent_flush_xattrs( xrep_tempfile_iounlock(rp->sc); /* Recreate the empty transaction and relock the inode. */ - error = xchk_trans_alloc_empty(rp->sc); - if (error) - return error; + xchk_trans_alloc_empty(rp->sc); xchk_ilock(rp->sc, XFS_ILOCK_EXCL); return 0; } diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index dc4033b91e44..e4105aaafe84 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -505,9 +505,7 @@ xqcheck_collect_counts( * transactions do not take sb_internal. */ xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) { error = xqcheck_collect_inode(xqc, ip); diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c index 709356dc6256..9a4ef823c5a7 100644 --- a/fs/xfs/scrub/rcbag_btree.c +++ b/fs/xfs/scrub/rcbag_btree.c @@ -47,29 +47,20 @@ rcbagbt_init_rec_from_cur( bag_rec->rbg_refcount = bag_irec->rbg_refcount; } -STATIC int64_t -rcbagbt_key_diff( +STATIC int +rcbagbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; const struct rcbag_key *kp = (const struct rcbag_key *)key; - if (kp->rbg_startblock > rec->rbg_startblock) - return 1; - if (kp->rbg_startblock < rec->rbg_startblock) - return -1; - - if (kp->rbg_blockcount > rec->rbg_blockcount) - return 1; - if (kp->rbg_blockcount < rec->rbg_blockcount) - return -1; - - return 0; + return cmp_int(kp->rbg_startblock, rec->rbg_startblock) ?: + cmp_int(kp->rbg_blockcount, rec->rbg_blockcount); } -STATIC int64_t -rcbagbt_diff_two_keys( +STATIC int +rcbagbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -80,17 +71,8 @@ rcbagbt_diff_two_keys( ASSERT(mask == NULL); - if (kp1->rbg_startblock > kp2->rbg_startblock) - return 1; - if (kp1->rbg_startblock < kp2->rbg_startblock) - return -1; - - if (kp1->rbg_blockcount > kp2->rbg_blockcount) - return 1; - if (kp1->rbg_blockcount < kp2->rbg_blockcount) - return -1; - - return 0; + return cmp_int(kp1->rbg_startblock, kp2->rbg_startblock) ?: + cmp_int(kp1->rbg_blockcount, kp2->rbg_blockcount); } STATIC int @@ -201,9 +183,9 @@ static const struct xfs_btree_ops rcbagbt_mem_ops = { .init_key_from_rec = rcbagbt_init_key_from_rec, .init_rec_from_cur = rcbagbt_init_rec_from_cur, .init_ptr_from_cur = xfbtree_init_ptr_from_cur, - .key_diff = rcbagbt_key_diff, + .cmp_key_with_cur = rcbagbt_cmp_key_with_cur, .buf_ops = &rcbagbt_mem_buf_ops, - .diff_two_keys = rcbagbt_diff_two_keys, + .cmp_two_keys = rcbagbt_cmp_two_keys, .keys_inorder = rcbagbt_keys_inorder, .recs_inorder = rcbagbt_recs_inorder, }; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f8f9ed30f56b..d00c18954a26 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1269,42 +1269,6 @@ xrep_setup_xfbtree( } /* - * Create a dummy transaction for use in a live update hook function. This - * function MUST NOT be called from regular repair code because the current - * process' transaction is saved via the cookie. - */ -int -xrep_trans_alloc_hook_dummy( - struct xfs_mount *mp, - void **cookiep, - struct xfs_trans **tpp) -{ - int error; - - *cookiep = current->journal_info; - current->journal_info = NULL; - - error = xfs_trans_alloc_empty(mp, tpp); - if (!error) - return 0; - - current->journal_info = *cookiep; - *cookiep = NULL; - return error; -} - -/* Cancel a dummy transaction used by a live update hook function. */ -void -xrep_trans_cancel_hook_dummy( - void **cookiep, - struct xfs_trans *tp) -{ - xfs_trans_cancel(tp); - current->journal_info = *cookiep; - *cookiep = NULL; -} - -/* * See if this buffer can pass the given ->verify_struct() function. * * If the buffer already has ops attached and they're not the ones that were diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index af0a3a9e5ed9..9c04295742c8 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -180,10 +180,6 @@ int xrep_quotacheck(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); -int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, - struct xfs_trans **tpp); -void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); - bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); int xrep_reset_metafile_resv(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index f5f73078ffe2..17d4a38d735c 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -951,9 +951,7 @@ end_agscan: sa->agf_bp = NULL; sa->agi_bp = NULL; xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); /* Iterate all AGs for inodes rmaps. */ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { @@ -1612,7 +1610,6 @@ xrep_rmapbt_live_update( struct xfs_mount *mp; struct xfs_btree_cur *mcur; struct xfs_trans *tp; - void *txcookie; int error; rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); @@ -1623,9 +1620,7 @@ xrep_rmapbt_live_update( trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); - error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); - if (error) - goto out_abort; + tp = xfs_trans_alloc_empty(mp); mutex_lock(&rr->lock); mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); @@ -1639,14 +1634,13 @@ xrep_rmapbt_live_update( if (error) goto out_cancel; - xrep_trans_cancel_hook_dummy(&txcookie, tp); + xfs_trans_cancel(tp); mutex_unlock(&rr->lock); return NOTIFY_DONE; out_cancel: xfbtree_trans_cancel(&rr->rmap_btree, tp); - xrep_trans_cancel_hook_dummy(&txcookie, tp); -out_abort: + xfs_trans_cancel(tp); mutex_unlock(&rr->lock); xchk_iscan_abort(&rr->iscan); out_unlock: diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index fc2592c53af5..7561941a337a 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -580,9 +580,7 @@ xrep_rtrmap_find_rmaps( */ xchk_trans_cancel(sc); xchk_rtgroup_unlock(&sc->sr); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { error = xrep_rtrmap_scan_inode(rr, ip); @@ -846,7 +844,6 @@ xrep_rtrmapbt_live_update( struct xfs_mount *mp; struct xfs_btree_cur *mcur; struct xfs_trans *tp; - void *txcookie; int error; rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb); @@ -857,9 +854,7 @@ xrep_rtrmapbt_live_update( trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p); - error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); - if (error) - goto out_abort; + tp = xfs_trans_alloc_empty(mp); mutex_lock(&rr->lock); mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree); @@ -873,14 +868,13 @@ xrep_rtrmapbt_live_update( if (error) goto out_cancel; - xrep_trans_cancel_hook_dummy(&txcookie, tp); + xfs_trans_cancel(tp); mutex_unlock(&rr->lock); return NOTIFY_DONE; out_cancel: xfbtree_trans_cancel(&rr->rtrmap_btree, tp); - xrep_trans_cancel_hook_dummy(&txcookie, tp); -out_abort: + xfs_trans_cancel(tp); xchk_iscan_abort(&rr->iscan); mutex_unlock(&rr->lock); out_unlock: diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 76e24032e99a..3c3b0d25006f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -876,10 +876,7 @@ xchk_scrubv_open_by_handle( struct xfs_inode *ip; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return NULL; - + tp = xfs_trans_alloc_empty(mp); error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); xfs_trans_cancel(tp); if (error) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d7c4ced47c15..1e6e9c10cea2 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2996,7 +2996,7 @@ DEFINE_EVENT(xrep_pptr_salvage_class, name, \ DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr); DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr); -TRACE_EVENT(xrep_xattr_class, +DECLARE_EVENT_CLASS(xrep_xattr_class, TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), TP_ARGS(ip, arg_ip), TP_STRUCT__entry( diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 26a04a783489..1ee4f835ac3c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -234,6 +234,47 @@ xfs_end_bio( } /* + * We cannot cancel the ioend directly on error. We may have already set other + * pages under writeback and hence we have to run I/O completion to mark the + * error state of the pages under writeback appropriately. + * + * If the folio has delalloc blocks on it, the caller is asking us to punch them + * out. If we don't, we can leave a stale delalloc mapping covered by a clean + * page that needs to be dirtied again before the delalloc mapping can be + * converted. This stale delalloc mapping can trip up a later direct I/O read + * operation on the same region. + * + * We prevent this by truncating away the delalloc regions on the folio. Because + * they are delalloc, we can do this without needing a transaction. Indeed - if + * we get ENOSPC errors, we have to be able to do this truncation without a + * transaction as there is no space left for block reservation (typically why + * we see a ENOSPC in writeback). + */ +static void +xfs_discard_folio( + struct folio *folio, + loff_t pos) +{ + struct xfs_inode *ip = XFS_I(folio->mapping->host); + struct xfs_mount *mp = ip->i_mount; + + if (xfs_is_shutdown(mp)) + return; + + xfs_alert_ratelimited(mp, + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); + + /* + * The end of the punch range is always the offset of the first + * byte of the next folio. Hence the end offset is only dependent on the + * folio itself and not the start offset that is passed in. + */ + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, + folio_pos(folio) + folio_size(folio), NULL); +} + +/* * Fast revalidation of the cached writeback mapping. Return true if the current * mapping is valid, false otherwise. */ @@ -278,13 +319,12 @@ xfs_imap_valid( static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, unsigned int len) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(wpc->inode); struct xfs_mount *mp = ip->i_mount; - ssize_t count = i_blocksize(inode); + ssize_t count = i_blocksize(wpc->inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb; @@ -436,81 +476,78 @@ allocate_blocks: return 0; } -static int -xfs_submit_ioend( +static ssize_t +xfs_writeback_range( struct iomap_writepage_ctx *wpc, - int status) + struct folio *folio, + u64 offset, + unsigned int len, + u64 end_pos) { - struct iomap_ioend *ioend = wpc->ioend; - unsigned int nofs_flag; - - /* - * We can allocate memory here while doing writeback on behalf of - * memory reclaim. To avoid memory allocation deadlocks set the - * task-wide nofs context for the following operations. - */ - nofs_flag = memalloc_nofs_save(); + ssize_t ret; + + ret = xfs_map_blocks(wpc, offset, len); + if (!ret) + ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len); + if (ret < 0) + xfs_discard_folio(folio, offset); + return ret; +} - /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { - status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), - ioend->io_offset, ioend->io_size); - } +static bool +xfs_ioend_needs_wq_completion( + struct iomap_ioend *ioend) +{ + /* Changing inode size requires a transaction. */ + if (xfs_ioend_is_append(ioend)) + return true; - memalloc_nofs_restore(nofs_flag); + /* Extent manipulation requires a transaction. */ + if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)) + return true; - /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || - (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) - ioend->io_bio.bi_end_io = xfs_end_bio; + /* Page cache invalidation cannot be done in irq context. */ + if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) + return true; - if (status) - return status; - submit_bio(&ioend->io_bio); - return 0; + return false; } -/* - * If the folio has delalloc blocks on it, the caller is asking us to punch them - * out. If we don't, we can leave a stale delalloc mapping covered by a clean - * page that needs to be dirtied again before the delalloc mapping can be - * converted. This stale delalloc mapping can trip up a later direct I/O read - * operation on the same region. - * - * We prevent this by truncating away the delalloc regions on the folio. Because - * they are delalloc, we can do this without needing a transaction. Indeed - if - * we get ENOSPC errors, we have to be able to do this truncation without a - * transaction as there is no space left for block reservation (typically why - * we see a ENOSPC in writeback). - */ -static void -xfs_discard_folio( - struct folio *folio, - loff_t pos) +static int +xfs_writeback_submit( + struct iomap_writepage_ctx *wpc, + int error) { - struct xfs_inode *ip = XFS_I(folio->mapping->host); - struct xfs_mount *mp = ip->i_mount; + struct iomap_ioend *ioend = wpc->wb_ctx; - if (xfs_is_shutdown(mp)) - return; + /* + * Convert CoW extents to regular. + * + * We can allocate memory here while doing writeback on behalf of memory + * reclaim. To avoid memory allocation deadlocks, set the task-wide + * nofs context. + */ + if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) { + unsigned int nofs_flag; - xfs_alert_ratelimited(mp, - "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", - folio, ip->i_ino, pos); + nofs_flag = memalloc_nofs_save(); + error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), + ioend->io_offset, ioend->io_size); + memalloc_nofs_restore(nofs_flag); + } /* - * The end of the punch range is always the offset of the first - * byte of the next folio. Hence the end offset is only dependent on the - * folio itself and not the start offset that is passed in. + * Send ioends that might require a transaction to the completion wq. */ - xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio), NULL); + if (xfs_ioend_needs_wq_completion(ioend)) + ioend->io_bio.bi_end_io = xfs_end_bio; + + return iomap_ioend_writeback_submit(wpc, error); } static const struct iomap_writeback_ops xfs_writeback_ops = { - .map_blocks = xfs_map_blocks, - .submit_ioend = xfs_submit_ioend, - .discard_folio = xfs_discard_folio, + .writeback_range = xfs_writeback_range, + .writeback_submit = xfs_writeback_submit, }; struct xfs_zoned_writepage_ctx { @@ -527,11 +564,10 @@ XFS_ZWPC(struct iomap_writepage_ctx *ctx) static int xfs_zoned_map_blocks( struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, unsigned int len) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(wpc->inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); @@ -590,22 +626,44 @@ xfs_zoned_map_blocks( return 0; } -static int -xfs_zoned_submit_ioend( +static ssize_t +xfs_zoned_writeback_range( struct iomap_writepage_ctx *wpc, - int status) + struct folio *folio, + u64 offset, + unsigned int len, + u64 end_pos) { - wpc->ioend->io_bio.bi_end_io = xfs_end_bio; - if (status) - return status; - xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); + ssize_t ret; + + ret = xfs_zoned_map_blocks(wpc, offset, len); + if (!ret) + ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len); + if (ret < 0) + xfs_discard_folio(folio, offset); + return ret; +} + +static int +xfs_zoned_writeback_submit( + struct iomap_writepage_ctx *wpc, + int error) +{ + struct iomap_ioend *ioend = wpc->wb_ctx; + + ioend->io_bio.bi_end_io = xfs_end_bio; + if (error) { + ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&ioend->io_bio); + return error; + } + xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone); return 0; } static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { - .map_blocks = xfs_zoned_map_blocks, - .submit_ioend = xfs_zoned_submit_ioend, - .discard_folio = xfs_discard_folio, + .writeback_range = xfs_zoned_writeback_range, + .writeback_submit = xfs_zoned_writeback_submit, }; STATIC int @@ -618,19 +676,29 @@ xfs_vm_writepages( xfs_iflags_clear(ip, XFS_ITRUNCATED); if (xfs_is_zoned_inode(ip)) { - struct xfs_zoned_writepage_ctx xc = { }; + struct xfs_zoned_writepage_ctx xc = { + .ctx = { + .inode = mapping->host, + .wbc = wbc, + .ops = &xfs_zoned_writeback_ops + }, + }; int error; - error = iomap_writepages(mapping, wbc, &xc.ctx, - &xfs_zoned_writeback_ops); + error = iomap_writepages(&xc.ctx); if (xc.open_zone) xfs_open_zone_put(xc.open_zone); return error; } else { - struct xfs_writepage_ctx wpc = { }; - - return iomap_writepages(mapping, wbc, &wpc.ctx, - &xfs_writeback_ops); + struct xfs_writepage_ctx wpc = { + .ctx = { + .inode = mapping->host, + .wbc = wbc, + .ops = &xfs_writeback_ops + }, + }; + + return iomap_writepages(&wpc.ctx); } } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index f683b7a9323f..5eef3bc30bda 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -91,41 +91,37 @@ xfs_attri_log_nameval_alloc( name_len + new_name_len + value_len + new_value_len); - nv->name.i_addr = nv + 1; - nv->name.i_len = name_len; - nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; - memcpy(nv->name.i_addr, name, name_len); + nv->name.iov_base = nv + 1; + nv->name.iov_len = name_len; + memcpy(nv->name.iov_base, name, name_len); if (new_name_len) { - nv->new_name.i_addr = nv->name.i_addr + name_len; - nv->new_name.i_len = new_name_len; - memcpy(nv->new_name.i_addr, new_name, new_name_len); + nv->new_name.iov_base = nv->name.iov_base + name_len; + nv->new_name.iov_len = new_name_len; + memcpy(nv->new_name.iov_base, new_name, new_name_len); } else { - nv->new_name.i_addr = NULL; - nv->new_name.i_len = 0; + nv->new_name.iov_base = NULL; + nv->new_name.iov_len = 0; } - nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME; if (value_len) { - nv->value.i_addr = nv->name.i_addr + name_len + new_name_len; - nv->value.i_len = value_len; - memcpy(nv->value.i_addr, value, value_len); + nv->value.iov_base = nv->name.iov_base + name_len + new_name_len; + nv->value.iov_len = value_len; + memcpy(nv->value.iov_base, value, value_len); } else { - nv->value.i_addr = NULL; - nv->value.i_len = 0; + nv->value.iov_base = NULL; + nv->value.iov_len = 0; } - nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; if (new_value_len) { - nv->new_value.i_addr = nv->name.i_addr + name_len + + nv->new_value.iov_base = nv->name.iov_base + name_len + new_name_len + value_len; - nv->new_value.i_len = new_value_len; - memcpy(nv->new_value.i_addr, new_value, new_value_len); + nv->new_value.iov_len = new_value_len; + memcpy(nv->new_value.iov_base, new_value, new_value_len); } else { - nv->new_value.i_addr = NULL; - nv->new_value.i_len = 0; + nv->new_value.iov_base = NULL; + nv->new_value.iov_len = 0; } - nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE; refcount_set(&nv->refcount, 1); return nv; @@ -170,21 +166,21 @@ xfs_attri_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_attri_log_format) + - xlog_calc_iovec_len(nv->name.i_len); + xlog_calc_iovec_len(nv->name.iov_len); - if (nv->new_name.i_len) { + if (nv->new_name.iov_len) { *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->new_name.i_len); + *nbytes += xlog_calc_iovec_len(nv->new_name.iov_len); } - if (nv->value.i_len) { + if (nv->value.iov_len) { *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->value.i_len); + *nbytes += xlog_calc_iovec_len(nv->value.iov_len); } - if (nv->new_value.i_len) { + if (nv->new_value.iov_len) { *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->new_value.i_len); + *nbytes += xlog_calc_iovec_len(nv->new_value.iov_len); } } @@ -212,31 +208,36 @@ xfs_attri_item_format( * the log recovery. */ - ASSERT(nv->name.i_len > 0); + ASSERT(nv->name.iov_len > 0); attrip->attri_format.alfi_size++; - if (nv->new_name.i_len > 0) + if (nv->new_name.iov_len > 0) attrip->attri_format.alfi_size++; - if (nv->value.i_len > 0) + if (nv->value.iov_len > 0) attrip->attri_format.alfi_size++; - if (nv->new_value.i_len > 0) + if (nv->new_value.iov_len > 0) attrip->attri_format.alfi_size++; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); - xlog_copy_from_iovec(lv, &vecp, &nv->name); - if (nv->new_name.i_len > 0) - xlog_copy_from_iovec(lv, &vecp, &nv->new_name); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, nv->name.iov_base, + nv->name.iov_len); - if (nv->value.i_len > 0) - xlog_copy_from_iovec(lv, &vecp, &nv->value); + if (nv->new_name.iov_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWNAME, + nv->new_name.iov_base, nv->new_name.iov_len); - if (nv->new_value.i_len > 0) - xlog_copy_from_iovec(lv, &vecp, &nv->new_value); + if (nv->value.iov_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, + nv->value.iov_base, nv->value.iov_len); + + if (nv->new_value.iov_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWVALUE, + nv->new_value.iov_base, nv->new_value.iov_len); } /* @@ -383,22 +384,22 @@ xfs_attr_log_item( attrp->alfi_ino = args->dp->i_ino; ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = nv->value.i_len; + attrp->alfi_value_len = nv->value.iov_len; switch (xfs_attr_log_item_op(attrp)) { case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: - ASSERT(nv->value.i_len == nv->new_value.i_len); + ASSERT(nv->value.iov_len == nv->new_value.iov_len); attrp->alfi_igen = VFS_I(args->dp)->i_generation; - attrp->alfi_old_name_len = nv->name.i_len; - attrp->alfi_new_name_len = nv->new_name.i_len; + attrp->alfi_old_name_len = nv->name.iov_len; + attrp->alfi_new_name_len = nv->new_name.iov_len; break; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_PPTR_SET: attrp->alfi_igen = VFS_I(args->dp)->i_generation; fallthrough; default: - attrp->alfi_name_len = nv->name.i_len; + attrp->alfi_name_len = nv->name.iov_len; break; } @@ -616,10 +617,7 @@ xfs_attri_iread_extents( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(ip->i_mount, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(ip->i_mount); xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -690,14 +688,14 @@ xfs_attri_recover_work( args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->name = nv->name.i_addr; - args->namelen = nv->name.i_len; - args->new_name = nv->new_name.i_addr; - args->new_namelen = nv->new_name.i_len; - args->value = nv->value.i_addr; - args->valuelen = nv->value.i_len; - args->new_value = nv->new_value.i_addr; - args->new_valuelen = nv->new_value.i_len; + args->name = nv->name.iov_base; + args->namelen = nv->name.iov_len; + args->new_name = nv->new_name.iov_base; + args->new_namelen = nv->new_name.iov_len; + args->value = nv->value.iov_base; + args->valuelen = nv->value.iov_len; + args->new_value = nv->new_value.iov_base; + args->new_valuelen = nv->new_value.iov_len; args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | XFS_DA_OP_LOGGED; @@ -754,8 +752,8 @@ xfs_attr_recover_work( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, - nv->name.i_len)) + !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.iov_base, + nv->name.iov_len)) return -EFSCORRUPTED; attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); @@ -953,50 +951,50 @@ static inline void * xfs_attri_validate_name_iovec( struct xfs_mount *mp, struct xfs_attri_log_format *attri_formatp, - const struct xfs_log_iovec *iovec, + const struct kvec *iovec, unsigned int name_len) { - if (iovec->i_len != xlog_calc_iovec_len(name_len)) { + if (iovec->iov_len != xlog_calc_iovec_len(name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); return NULL; } - if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr, + if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->iov_base, name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - iovec->i_addr, iovec->i_len); + iovec->iov_base, iovec->iov_len); return NULL; } - return iovec->i_addr; + return iovec->iov_base; } static inline void * xfs_attri_validate_value_iovec( struct xfs_mount *mp, struct xfs_attri_log_format *attri_formatp, - const struct xfs_log_iovec *iovec, + const struct kvec *iovec, unsigned int value_len) { - if (iovec->i_len != xlog_calc_iovec_len(value_len)) { + if (iovec->iov_len != xlog_calc_iovec_len(value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); return NULL; } if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) && - !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) { + !xfs_parent_valuecheck(mp, iovec->iov_base, value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - iovec->i_addr, iovec->i_len); + iovec->iov_base, iovec->iov_len); return NULL; } - return iovec->i_addr; + return iovec->iov_base; } STATIC int @@ -1023,13 +1021,13 @@ xlog_recover_attri_commit_pass2( /* Validate xfs_attri_log_format before the large memory allocation */ len = sizeof(struct xfs_attri_log_format); - if (item->ri_buf[i].i_len != len) { + if (item->ri_buf[i].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } - attri_formatp = item->ri_buf[i].i_addr; + attri_formatp = item->ri_buf[i].iov_base; if (!xfs_attri_validate(mp, attri_formatp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); @@ -1218,10 +1216,10 @@ xlog_recover_attrd_commit_pass2( { struct xfs_attrd_log_format *attrd_formatp; - attrd_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { + attrd_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_attrd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index e74128cbb722..d108a11b55ae 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -12,10 +12,10 @@ struct xfs_mount; struct kmem_zone; struct xfs_attri_log_nameval { - struct xfs_log_iovec name; - struct xfs_log_iovec new_name; /* PPTR_REPLACE only */ - struct xfs_log_iovec value; - struct xfs_log_iovec new_value; /* PPTR_REPLACE only */ + struct kvec name; + struct kvec new_name; /* PPTR_REPLACE only */ + struct kvec value; + struct kvec new_value; /* PPTR_REPLACE only */ refcount_t refcount; /* name and value follow the end of this struct */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 646c515ee355..80f0c4bcc483 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -654,24 +654,24 @@ xlog_recover_bui_commit_pass2( struct xfs_bui_log_format *bui_formatp; size_t len; - bui_formatp = item->ri_buf[0].i_addr; + bui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_bui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -705,10 +705,10 @@ xlog_recover_bud_commit_pass2( { struct xfs_bud_log_format *bud_formatp; - bud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + bud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_bud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8af83bd161f9..f9ef3b2a332a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1683,7 +1683,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_fput(btp->bt_bdev_file); + bdev_fput(btp->bt_file); kfree(btp); } @@ -1712,8 +1712,8 @@ xfs_configure_buftarg_atomic_writes( max_bytes = 0; } - btp->bt_bdev_awu_min = min_bytes; - btp->bt_bdev_awu_max = max_bytes; + btp->bt_awu_min = min_bytes; + btp->bt_awu_max = max_bytes; } /* Configure a buffer target that abstracts a block device. */ @@ -1738,14 +1738,9 @@ xfs_configure_buftarg( return -EINVAL; } - /* - * Flush the block device pagecache so our bios see anything dirtied - * before mount. - */ if (bdev_can_atomic_write(btp->bt_bdev)) xfs_configure_buftarg_atomic_writes(btp); - - return sync_blockdev(btp->bt_bdev); + return 0; } int @@ -1803,7 +1798,7 @@ xfs_alloc_buftarg( btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; - btp->bt_bdev_file = bdev_file; + btp->bt_file = bdev_file; btp->bt_bdev = file_bdev(bdev_file); btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, @@ -2082,44 +2077,6 @@ xfs_buf_delwri_submit( return error; } -/* - * Push a single buffer on a delwri queue. - * - * The purpose of this function is to submit a single buffer of a delwri queue - * and return with the buffer still on the original queue. - * - * The buffer locking and queue management logic between _delwri_pushbuf() and - * _delwri_queue() guarantee that the buffer cannot be queued to another list - * before returning. - */ -int -xfs_buf_delwri_pushbuf( - struct xfs_buf *bp, - struct list_head *buffer_list) -{ - int error; - - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - - trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); - - xfs_buf_lock(bp); - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); - bp->b_flags |= XBF_WRITE; - xfs_buf_submit(bp); - - /* - * The buffer is now locked, under I/O but still on the original delwri - * queue. Wait for I/O completion, restore the DELWRI_Q flag and - * return with the buffer unlocked and still on the original queue. - */ - error = xfs_buf_iowait(bp); - bp->b_flags |= _XBF_DELWRI_Q; - xfs_buf_unlock(bp); - - return error; -} - void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 9d2ab567cf81..b269e115d9ac 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -94,7 +94,6 @@ void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); */ struct xfs_buftarg { dev_t bt_dev; - struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; struct file *bt_file; @@ -112,9 +111,9 @@ struct xfs_buftarg { struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; - /* Atomic write unit values, bytes */ - unsigned int bt_bdev_awu_min; - unsigned int bt_bdev_awu_max; + /* Hardware atomic write unit values, bytes */ + unsigned int bt_awu_min; + unsigned int bt_awu_max; /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; @@ -326,7 +325,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); -extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { @@ -376,7 +374,6 @@ extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); -#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 90139e0f3271..8d85b5eee444 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -32,19 +32,74 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } +static void +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return; + } + + bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), + GFP_KERNEL | __GFP_NOFAIL); +} + +static void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kfree(bip->bli_formats); + bip->bli_formats = NULL; + } +} + +static void +xfs_buf_item_free( + struct xfs_buf_log_item *bip) +{ + xfs_buf_item_free_format(bip); + kvfree(bip->bli_item.li_lv_shadow); + kmem_cache_free(xfs_buf_item_cache, bip); +} + +/* + * xfs_buf_item_relse() is called when the buf log item is no longer needed. + */ +static void +xfs_buf_item_relse( + struct xfs_buf_log_item *bip) +{ + struct xfs_buf *bp = bip->bli_buf; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + ASSERT(atomic_read(&bip->bli_refcount) == 0); + + bp->b_log_item = NULL; + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( - struct xfs_log_iovec *iovec) + struct kvec *iovec) { - struct xfs_buf_log_format *blfp = iovec->i_addr; + struct xfs_buf_log_format *blfp = iovec->iov_base; char *bmp_end; char *item_end; - if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len) return false; - item_end = (char *)iovec->i_addr + iovec->i_len; + item_end = (char *)iovec->iov_base + iovec->iov_len; bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; return bmp_end <= item_end; } @@ -390,6 +445,42 @@ xfs_buf_item_pin( } /* + * For a stale BLI, process all the necessary completions that must be + * performed when the final BLI reference goes away. The buffer will be + * referenced and locked here - we return to the caller with the buffer still + * referenced and locked for them to finalise processing of the buffer. + */ +static void +xfs_buf_item_finish_stale( + struct xfs_buf_log_item *bip) +{ + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_item *lip = &bip->bli_item; + + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_flags & XBF_STALE); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!bp->b_transp); + + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_item_done(bp); + xfs_buf_inode_iodone(bp); + ASSERT(list_empty(&bp->b_li_list)); + return; + } + + /* + * We may or may not be on the AIL here, xfs_trans_ail_delete() will do + * the right thing regardless of the situation in which we are called. + */ + xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip); + ASSERT(bp->b_log_item == NULL); +} + +/* * This is called to unpin the buffer associated with the buf log item which was * previously pinned with a call to xfs_buf_item_pin(). We enter this function * with a buffer pin count, a buffer reference and a BLI reference. @@ -438,13 +529,6 @@ xfs_buf_item_unpin( } if (stale) { - ASSERT(bip->bli_flags & XFS_BLI_STALE); - ASSERT(xfs_buf_islocked(bp)); - ASSERT(bp->b_flags & XBF_STALE); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(list_empty(&lip->li_trans)); - ASSERT(!bp->b_transp); - trace_xfs_buf_item_unpin_stale(bip); /* @@ -455,22 +539,7 @@ xfs_buf_item_unpin( * processing is complete. */ xfs_buf_rele(bp); - - /* - * If we get called here because of an IO error, we may or may - * not have the item on the AIL. xfs_trans_ail_delete() will - * take care of that situation. xfs_trans_ail_delete() drops - * the AIL lock. - */ - if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_item_done(bp); - xfs_buf_inode_iodone(bp); - ASSERT(list_empty(&bp->b_li_list)); - } else { - xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - } + xfs_buf_item_finish_stale(bip); xfs_buf_relse(bp); return; } @@ -543,43 +612,42 @@ xfs_buf_item_push( * Drop the buffer log item refcount and take appropriate action. This helper * determines whether the bli must be freed or not, since a decrement to zero * does not necessarily mean the bli is unused. - * - * Return true if the bli is freed, false otherwise. */ -bool +void xfs_buf_item_put( struct xfs_buf_log_item *bip) { - struct xfs_log_item *lip = &bip->bli_item; - bool aborted; - bool dirty; + + ASSERT(xfs_buf_islocked(bip->bli_buf)); /* drop the bli ref and return if it wasn't the last one */ if (!atomic_dec_and_test(&bip->bli_refcount)) - return false; + return; - /* - * We dropped the last ref and must free the item if clean or aborted. - * If the bli is dirty and non-aborted, the buffer was clean in the - * transaction but still awaiting writeback from previous changes. In - * that case, the bli is freed on buffer writeback completion. - */ - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xlog_is_shutdown(lip->li_log); - dirty = bip->bli_flags & XFS_BLI_DIRTY; - if (dirty && !aborted) - return false; + /* If the BLI is in the AIL, then it is still dirty and in use */ + if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) { + ASSERT(bip->bli_flags & XFS_BLI_DIRTY); + return; + } /* - * The bli is aborted or clean. An aborted item may be in the AIL - * regardless of dirty state. For example, consider an aborted - * transaction that invalidated a dirty bli and cleared the dirty - * state. + * In shutdown conditions, we can be asked to free a dirty BLI that + * isn't in the AIL. This can occur due to a checkpoint aborting a BLI + * instead of inserting it into the AIL at checkpoint IO completion. If + * there's another bli reference (e.g. a btree cursor holds a clean + * reference) and it is released via xfs_trans_brelse(), we can get here + * with that aborted, dirty BLI. In this case, it is safe to free the + * dirty BLI immediately, as it is not in the AIL and there are no + * other references to it. + * + * We should never get here with a stale BLI via that path as + * xfs_trans_brelse() specifically holds onto stale buffers rather than + * releasing them. */ - if (aborted) - xfs_trans_ail_delete(lip, 0); - xfs_buf_item_relse(bip->bli_buf); - return true; + ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) || + test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags)); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_relse(bip); } /* @@ -600,6 +668,15 @@ xfs_buf_item_put( * if necessary but do not unlock the buffer. This is for support of * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't * free the item. + * + * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must* + * perform a completion abort of any objects attached to the buffer for IO + * tracking purposes. This generally only happens in shutdown situations, + * normally xfs_buf_item_unpin() will drop the last BLI reference and perform + * completion processing. However, because transaction completion can race with + * checkpoint completion during a shutdown, this release context may end up + * being the last active reference to the BLI and so needs to perform this + * cleanup. */ STATIC void xfs_buf_item_release( @@ -607,18 +684,19 @@ xfs_buf_item_release( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool released; bool hold = bip->bli_flags & XFS_BLI_HOLD; bool stale = bip->bli_flags & XFS_BLI_STALE; -#if defined(DEBUG) || defined(XFS_WARN) - bool ordered = bip->bli_flags & XFS_BLI_ORDERED; - bool dirty = bip->bli_flags & XFS_BLI_DIRTY; bool aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; #endif trace_xfs_buf_item_release(bip); + ASSERT(xfs_buf_islocked(bp)); + /* * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. @@ -634,16 +712,56 @@ xfs_buf_item_release( bp->b_transp = NULL; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + /* If there are other references, then we have nothing to do. */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + goto out_release; + + /* + * Stale buffer completion frees the BLI, unlocks and releases the + * buffer. Neither the BLI or buffer are safe to reference after this + * call, so there's nothing more we need to do here. + * + * If we get here with a stale buffer and references to the BLI remain, + * we must not unlock the buffer as the last BLI reference owns lock + * context, not us. + */ + if (stale) { + xfs_buf_item_finish_stale(bip); + xfs_buf_relse(bp); + ASSERT(!hold); + return; + } + /* - * Unref the item and unlock the buffer unless held or stale. Stale - * buffers remain locked until final unpin unless the bli is freed by - * the unref call. The latter implies shutdown because buffer - * invalidation dirties the bli and transaction. + * Dirty or clean, aborted items are done and need to be removed from + * the AIL and released. This frees the BLI, but leaves the buffer + * locked and referenced. */ - released = xfs_buf_item_put(bip); - if (hold || (stale && !released)) + if (aborted || xlog_is_shutdown(lip->li_log)) { + ASSERT(list_empty(&bip->bli_buf->b_li_list)); + xfs_buf_item_done(bp); + goto out_release; + } + + /* + * Clean, unreferenced BLIs can be immediately freed, leaving the buffer + * locked and referenced. + * + * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback. + */ + if (!dirty) + xfs_buf_item_relse(bip); + else + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); + + /* Not safe to reference the BLI from here */ +out_release: + /* + * If we get here with a stale buffer, we must not unlock the + * buffer as the last BLI reference owns lock context, not us. + */ + if (stale || hold) return; - ASSERT(!stale || aborted); xfs_buf_relse(bp); } @@ -729,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC void -xfs_buf_item_get_format( - struct xfs_buf_log_item *bip, - int count) -{ - ASSERT(bip->bli_formats == NULL); - bip->bli_format_count = count; - - if (count == 1) { - bip->bli_formats = &bip->__bli_format; - return; - } - - bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), - GFP_KERNEL | __GFP_NOFAIL); -} - -STATIC void -xfs_buf_item_free_format( - struct xfs_buf_log_item *bip) -{ - if (bip->bli_formats != &bip->__bli_format) { - kfree(bip->bli_formats); - bip->bli_formats = NULL; - } -} - /* * Allocate a new buf log item to go with the given buffer. * Set the buffer's b_log_item field to point to the new @@ -976,34 +1067,6 @@ xfs_buf_item_dirty_format( return false; } -STATIC void -xfs_buf_item_free( - struct xfs_buf_log_item *bip) -{ - xfs_buf_item_free_format(bip); - kvfree(bip->bli_item.li_lv_shadow); - kmem_cache_free(xfs_buf_item_cache, bip); -} - -/* - * xfs_buf_item_relse() is called when the buf log item is no longer needed. - */ -void -xfs_buf_item_relse( - struct xfs_buf *bp) -{ - struct xfs_buf_log_item *bip = bp->b_log_item; - - trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - - if (atomic_read(&bip->bli_refcount)) - return; - bp->b_log_item = NULL; - xfs_buf_rele(bp); - xfs_buf_item_free(bip); -} - void xfs_buf_item_done( struct xfs_buf *bp) @@ -1023,5 +1086,5 @@ xfs_buf_item_done( xfs_trans_ail_delete(&bp->b_log_item->bli_item, (bp->b_flags & _XBF_LOGRECOVERY) ? 0 : SHUTDOWN_CORRUPT_INCORE); - xfs_buf_item_relse(bp); + xfs_buf_item_relse(bp->b_log_item); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index e10e324cd245..3159325dd17b 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -49,8 +49,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_done(struct xfs_buf *bp); -void xfs_buf_item_relse(struct xfs_buf *); -bool xfs_buf_item_put(struct xfs_buf_log_item *); +void xfs_buf_item_put(struct xfs_buf_log_item *bip); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); @@ -62,7 +61,7 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) } #endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); -bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); +bool xfs_buf_log_check_iovec(struct kvec *iovec); unsigned int xfs_buf_inval_log_space(unsigned int map_count, unsigned int blocksize); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index d4c5cef5bc43..5d58e2ae4972 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -159,7 +159,7 @@ STATIC enum xlog_recover_reorder xlog_recover_buf_reorder( struct xlog_recover_item *item) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; if (buf_f->blf_flags & XFS_BLF_CANCEL) return XLOG_REORDER_CANCEL_LIST; @@ -173,7 +173,7 @@ xlog_recover_buf_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); } @@ -187,11 +187,11 @@ xlog_recover_buf_commit_pass1( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *bf = item->ri_buf[0].iov_base; if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { - xfs_err(log->l_mp, "bad buffer log item size (%d)", - item->ri_buf[0].i_len); + xfs_err(log->l_mp, "bad buffer log item size (%zd)", + item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -487,8 +487,8 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(buf_f->blf_data_map, buf_f->blf_map_size, bit); ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(item->ri_buf[i].iov_base != NULL); + ASSERT(item->ri_buf[i].iov_len % XFS_BLF_CHUNK == 0); ASSERT(BBTOB(bp->b_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); @@ -500,8 +500,8 @@ xlog_recover_do_reg_buffer( * the log. Hence we need to trim nbits back to the length of * the current region being copied out of the log. */ - if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) - nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + if (item->ri_buf[i].iov_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].iov_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -511,18 +511,18 @@ xlog_recover_do_reg_buffer( fa = NULL; if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - if (item->ri_buf[i].i_addr == NULL) { + if (item->ri_buf[i].iov_base == NULL) { xfs_alert(mp, "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < size_disk_dquot) { + if (item->ri_buf[i].iov_len < size_disk_dquot) { xfs_alert(mp, - "XFS: dquot too small (%d) in %s.", - item->ri_buf[i].i_len, __func__); + "XFS: dquot too small (%zd) in %s.", + item->ri_buf[i].iov_len, __func__); goto next; } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); + fa = xfs_dquot_verify(mp, item->ri_buf[i].iov_base, -1); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", @@ -533,7 +533,7 @@ xlog_recover_do_reg_buffer( memcpy(xfs_buf_offset(bp, (uint)bit << XFS_BLF_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ + item->ri_buf[i].iov_base, /* source */ nbits<<XFS_BLF_SHIFT); /* length */ next: i++; @@ -669,8 +669,8 @@ xlog_recover_do_inode_buffer( if (next_unlinked_offset < reg_buf_offset) continue; - ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT(item->ri_buf[item_index].iov_base != NULL); + ASSERT((item->ri_buf[item_index].iov_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); /* @@ -678,7 +678,7 @@ xlog_recover_do_inode_buffer( * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = item->ri_buf[item_index].i_addr + + logged_nextp = item->ri_buf[item_index].iov_base + next_unlinked_offset - reg_buf_offset; if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { xfs_alert(mp, @@ -1002,7 +1002,7 @@ xlog_recover_buf_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t current_lsn) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 94d0873bcd62..ee49f20875af 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -103,24 +103,6 @@ xfs_discard_endio( bio_put(bio); } -static inline struct block_device * -xfs_group_bdev( - const struct xfs_group *xg) -{ - struct xfs_mount *mp = xg->xg_mount; - - switch (xg->xg_type) { - case XG_TYPE_AG: - return mp->m_ddev_targp->bt_bdev; - case XG_TYPE_RTG: - return mp->m_rtdev_targp->bt_bdev; - default: - ASSERT(0); - break; - } - return NULL; -} - /* * Walk the discard list and issue discards on all the busy extents in the * list. We plug and chain the bios so that we only need a single completion @@ -138,11 +120,14 @@ xfs_discard_extents( blk_start_plug(&plug); list_for_each_entry(busyp, &extents->extent_list, list) { - trace_xfs_discard_extent(busyp->group, busyp->bno, - busyp->length); + struct xfs_group *xg = busyp->group; + struct xfs_buftarg *btp = + xfs_group_type_buftarg(xg->xg_mount, xg->xg_type); - error = __blkdev_issue_discard(xfs_group_bdev(busyp->group), - xfs_gbno_to_daddr(busyp->group, busyp->bno), + trace_xfs_discard_extent(xg, busyp->bno, busyp->length); + + error = __blkdev_issue_discard(btp->bt_bdev, + xfs_gbno_to_daddr(xg, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_KERNEL, &bio); if (error && error != -EOPNOTSUPP) { @@ -204,9 +189,7 @@ xfs_trim_gather_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) @@ -598,9 +581,7 @@ xfs_trim_rtextents( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); /* * Walk the free ranges between low and high. The query_range function @@ -716,9 +697,7 @@ xfs_trim_rtgroup_extents( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); /* * Walk the free ranges between low and high. The query_range function diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b4e32f0860b7..0bd8022e47b4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1398,11 +1398,9 @@ xfs_qm_dqflush( ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); + ASSERT(atomic_read(&dqp->q_pincount) == 0); trace_xfs_dqflush(dqp); - - xfs_qm_dqunpin_wait(dqp); - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 2c2720ce6923..89bc9bcaf51e 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -34,10 +34,10 @@ xlog_recover_dquot_ra_pass2( if (mp->m_qflags == 0) return; - recddq = item->ri_buf[1].i_addr; + recddq = item->ri_buf[1].iov_base; if (recddq == NULL) return; - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) return; type = recddq->d_type & XFS_DQTYPE_REC_MASK; @@ -45,7 +45,7 @@ xlog_recover_dquot_ra_pass2( if (log->l_quotaoffs_flag & type) return; - dq_f = item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].iov_base; ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); @@ -79,14 +79,14 @@ xlog_recover_dquot_commit_pass2( if (mp->m_qflags == 0) return 0; - recddq = item->ri_buf[1].i_addr; + recddq = item->ri_buf[1].iov_base; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); return -EFSCORRUPTED; } - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { - xfs_alert(log->l_mp, "dquot too small (%d) in %s.", - item->ri_buf[1].i_len, __func__); + if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) { + xfs_alert(log->l_mp, "dquot too small (%zd) in %s.", + item->ri_buf[1].iov_len, __func__); return -EFSCORRUPTED; } @@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2( * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. */ - dq_f = item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].iov_base; ASSERT(dq_f); fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id); if (fa) { @@ -147,7 +147,7 @@ xlog_recover_dquot_commit_pass2( } } - memcpy(ddq, recddq, item->ri_buf[1].i_len); + memcpy(ddq, recddq, item->ri_buf[1].iov_len); if (xfs_has_crc(mp)) { xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -192,7 +192,7 @@ xlog_recover_quotaoff_commit_pass1( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr; + struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].iov_base; ASSERT(qoff_f); /* diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c index 264a121c5e16..229cbe0adf17 100644 --- a/fs/xfs/xfs_exchmaps_item.c +++ b/fs/xfs/xfs_exchmaps_item.c @@ -558,12 +558,12 @@ xlog_recover_xmi_commit_pass2( size_t len; len = sizeof(struct xfs_xmi_log_format); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } - xmi_formatp = item->ri_buf[0].i_addr; + xmi_formatp = item->ri_buf[0].iov_base; if (xmi_formatp->__pad != 0) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; @@ -598,8 +598,8 @@ xlog_recover_xmd_commit_pass2( { struct xfs_xmd_log_format *xmd_formatp; - xmd_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) { + xmd_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_xmd_log_format)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index f069b04e8ea1..3e6e019b6146 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -68,4 +68,12 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +/* + * Zoned RTGs don't need to track busy extents, as the actual block freeing only + * happens by a zone reset, which forces out all transactions that touched the + * to be reset zone first. + */ +#define xfs_group_has_extent_busy(mp, type) \ + ((type) == XG_TYPE_AG || !xfs_has_zoned((mp))) + #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d574f5f639fa..47ee598a9827 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -182,15 +182,18 @@ xfs_efi_init( * It will handle the conversion of formats if necessary. */ STATIC int -xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) +xfs_efi_copy_format( + struct kvec *buf, + struct xfs_efi_log_format *dst_efi_fmt) { - xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; - uint i; - uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); - uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); - uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); + struct xfs_efi_log_format *src_efi_fmt = buf->iov_base; + uint len, len32, len64, i; - if (buf->i_len == len) { + len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); + len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); + len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); + + if (buf->iov_len == len) { memcpy(dst_efi_fmt, src_efi_fmt, offsetof(struct xfs_efi_log_format, efi_extents)); for (i = 0; i < src_efi_fmt->efi_nextents; i++) @@ -198,8 +201,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) &src_efi_fmt->efi_extents[i], sizeof(struct xfs_extent)); return 0; - } else if (buf->i_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; + } else if (buf->iov_len == len32) { + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -212,8 +215,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) src_efi_fmt_32->efi_extents[i].ext_len; } return 0; - } else if (buf->i_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; + } else if (buf->iov_len == len64) { + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; @@ -227,8 +230,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr, - buf->i_len); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->iov_base, + buf->iov_len); return -EFSCORRUPTED; } @@ -865,11 +868,11 @@ xlog_recover_efi_commit_pass2( struct xfs_efi_log_format *efi_formatp; int error; - efi_formatp = item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -904,11 +907,11 @@ xlog_recover_rtefi_commit_pass2( struct xfs_efi_log_format *efi_formatp; int error; - efi_formatp = item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -933,7 +936,7 @@ xlog_recover_rtefi_commit_pass2( xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif @@ -958,9 +961,9 @@ xlog_recover_efd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; - int buflen = item->ri_buf[0].i_len; + int buflen = item->ri_buf[0].iov_len; - efd_formatp = item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].iov_base; if (buflen < sizeof(struct xfs_efd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, @@ -968,9 +971,9 @@ xlog_recover_efd_commit_pass2( return -EFSCORRUPTED; } - if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof( efd_formatp->efd_nextents) && - item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof( efd_formatp->efd_nextents)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, efd_formatp, buflen); @@ -995,9 +998,9 @@ xlog_recover_rtefd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; - int buflen = item->ri_buf[0].i_len; + int buflen = item->ri_buf[0].iov_len; - efd_formatp = item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].iov_base; if (buflen < sizeof(struct xfs_efd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, @@ -1005,9 +1008,9 @@ xlog_recover_rtefd_commit_pass2( return -EFSCORRUPTED; } - if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof( efd_formatp->efd_nextents) && - item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof( efd_formatp->efd_nextents)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, efd_formatp, buflen); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 48254a72071b..55a304cb3aef 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -497,7 +497,7 @@ restart: static ssize_t xfs_zoned_write_space_reserve( - struct xfs_inode *ip, + struct xfs_mount *mp, struct kiocb *iocb, struct iov_iter *from, unsigned int flags, @@ -533,8 +533,8 @@ xfs_zoned_write_space_reserve( * * Any remaining block will be returned after the write. */ - return xfs_zoned_space_reserve(ip, - XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); + return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, + flags, ac); } static int @@ -718,13 +718,13 @@ xfs_file_dio_write_zoned( struct xfs_zone_alloc_ctx ac = { }; ssize_t ret; - ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); if (ret < 0) return ret; ret = xfs_file_dio_write_aligned(ip, iocb, from, &xfs_zoned_direct_write_iomap_ops, &xfs_dio_zoned_write_ops, &ac); - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); return ret; } @@ -752,7 +752,7 @@ xfs_file_dio_write_atomic( * HW offload should be faster, so try that first if it is already * known that the write length is not too large. */ - if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max) + if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) dops = &xfs_atomic_write_cow_iomap_ops; else dops = &xfs_direct_write_iomap_ops; @@ -979,7 +979,8 @@ write_retry: trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops, NULL); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + NULL); /* * If we hit a space limit, try to free up some lingering preallocated @@ -1032,7 +1033,7 @@ xfs_file_buffered_write_zoned( struct xfs_zone_alloc_ctx ac = { }; ssize_t ret; - ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac); if (ret < 0) return ret; @@ -1059,7 +1060,8 @@ xfs_file_buffered_write_zoned( retry: trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops, &ac); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + &ac); if (ret == -ENOSPC && !cleared_space) { /* * Kick off writeback to convert delalloc space and release the @@ -1073,7 +1075,7 @@ retry: out_unlock: xfs_iunlock(ip, iolock); out_unreserve: - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); if (ret > 0) { XFS_STATS_ADD(mp, xs_write_bytes, ret); ret = generic_write_sync(iocb, ret); @@ -1335,9 +1337,10 @@ xfs_falloc_allocate_range( } #define XFS_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) + (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) STATIC long __xfs_file_fallocate( @@ -1413,11 +1416,11 @@ xfs_file_zoned_fallocate( struct xfs_inode *ip = XFS_I(file_inode(file)); int error; - error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac); if (error) return error; error = __xfs_file_fallocate(file, mode, offset, len, &ac); - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); return error; } @@ -1729,7 +1732,7 @@ xfs_dax_fault_locked( bool write_fault) { vm_fault_t ret; - pfn_t pfn; + unsigned long pfn; if (!IS_ENABLED(CONFIG_FS_DAX)) { ASSERT(0); @@ -1827,12 +1830,12 @@ xfs_write_fault_zoned( * But as the overallocation is limited to less than a folio and will be * release instantly that's just fine. */ - error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, - &ac); + error = xfs_zoned_space_reserve(ip->i_mount, + XFS_B_TO_FSB(ip->i_mount, len), 0, &ac); if (error < 0) return vmf_fs_error(error); ret = __xfs_write_fault(vmf, order, &ac); - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); return ret; } @@ -1913,10 +1916,10 @@ static const struct vm_operations_struct xfs_file_vm_ops = { }; STATIC int -xfs_file_mmap( - struct file *file, - struct vm_area_struct *vma) +xfs_file_mmap_prepare( + struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); @@ -1924,13 +1927,14 @@ xfs_file_mmap( * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ - if (!daxdev_mapping_supported(vma, target->bt_daxdev)) + if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), + target->bt_daxdev)) return -EOPNOTSUPP; file_accessed(file); - vma->vm_ops = &xfs_file_vm_ops; + desc->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vm_flags_set(vma, VM_HUGEPAGE); + desc->vm_flags |= VM_HUGEPAGE; return 0; } @@ -1945,7 +1949,7 @@ const struct file_operations xfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .mmap = xfs_file_mmap, + .mmap_prepare = xfs_file_mmap_prepare, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 414b27a86458..af68c7de8ee8 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -1270,9 +1270,7 @@ xfs_getfsmap( * buffer locking abilities to detect cycles in the rmapbt * without deadlocking. */ - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - break; + tp = xfs_trans_alloc_empty(mp); info.dev = handlers[i].dev; info.last = false; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 726e29b837e6..4cf7abe50143 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -893,10 +893,7 @@ xfs_metafile_iget( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); xfs_trans_cancel(tp); return error; @@ -979,7 +976,15 @@ xfs_reclaim_inode( */ if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); + /* + * Avoid a ABBA deadlock on the inode cluster buffer vs + * concurrent xfs_ifree_cluster() trying to mark the inode + * stale. We don't need the inode locked to run the flush abort + * code, but the flush abort needs to lock the cluster buffer. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iflush_shutdown_abort(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 4345db501714..f83ec2bd0583 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -158,7 +158,7 @@ xlog_recover_icreate_commit_pass2( int nbufs; int i; - icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + icl = (struct xfs_icreate_log *)item->ri_buf[0].iov_base; if (icl->icl_type != XFS_LI_ICREATE) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); return -EINVAL; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee3e0f284287..9c39251961a3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1635,7 +1635,7 @@ retry: iip = ip->i_itemp; if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); - ASSERT(iip->ili_last_fields); + ASSERT(iip->ili_last_fields || xlog_is_shutdown(mp->m_log)); goto out_iunlock; } @@ -2932,12 +2932,9 @@ xfs_inode_reload_unlinked( struct xfs_inode *ip) { struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc_empty(ip->i_mount, &tp); - if (error) - return error; + int error = 0; + tp = xfs_trans_alloc_empty(ip->i_mount); xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_inode_unlinked_incomplete(ip)) error = xfs_inode_reload_unlinked_bucket(tp, ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d7e2b902ef5c..07fbdcc4cbf5 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -358,7 +358,7 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { - return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0; + return xfs_inode_buftarg(ip)->bt_awu_max > 0; } /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c6cb0b6b9e46..829675700fcd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -758,11 +758,14 @@ xfs_inode_item_push( * completed and items removed from the AIL before the next push * attempt. */ + trace_xfs_inode_push_stale(ip, _RET_IP_); return XFS_ITEM_PINNED; } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) { + trace_xfs_inode_push_pinned(ip, _RET_IP_); return XFS_ITEM_PINNED; + } if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; @@ -1179,12 +1182,12 @@ xfs_iflush_shutdown_abort( */ int xfs_inode_item_format_convert( - struct xfs_log_iovec *buf, + struct kvec *buf, struct xfs_inode_log_format *in_f) { - struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + struct xfs_inode_log_format_32 *in_f32 = buf->iov_base; - if (buf->i_len != sizeof(*in_f32)) { + if (buf->iov_len != sizeof(*in_f32)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 377e06007804..ba92ce11a011 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -46,8 +46,8 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); extern void xfs_iflush_shutdown_abort(struct xfs_inode *); -extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, - struct xfs_inode_log_format *); +int xfs_inode_item_format_convert(struct kvec *buf, + struct xfs_inode_log_format *in_f); extern struct kmem_cache *xfs_ili_cache; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 7205fd14f6b3..9d1999d41be1 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -30,13 +30,13 @@ xlog_recover_inode_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) { + struct xfs_inode_log_format *ilfp = item->ri_buf[0].iov_base; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); } else { - struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; + struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].iov_base; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); @@ -326,8 +326,8 @@ xlog_recover_inode_commit_pass2( int need_free = 0; xfs_failaddr_t fa; - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - in_f = item->ri_buf[0].i_addr; + if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) { + in_f = item->ri_buf[0].iov_base; } else { in_f = kmalloc(sizeof(struct xfs_inode_log_format), GFP_KERNEL | __GFP_NOFAIL); @@ -366,7 +366,7 @@ xlog_recover_inode_commit_pass2( error = -EFSCORRUPTED; goto out_release; } - ldip = item->ri_buf[1].i_addr; + ldip = item->ri_buf[1].iov_base; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", @@ -472,12 +472,12 @@ xlog_recover_inode_commit_pass2( goto out_release; } isize = xfs_log_dinode_size(mp); - if (unlikely(item->ri_buf[1].i_len > isize)) { + if (unlikely(item->ri_buf[1].iov_len > isize)) { XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "Bad inode 0x%llx log dinode size 0x%x", - in_f->ilf_ino, item->ri_buf[1].i_len); + "Bad inode 0x%llx log dinode size 0x%zx", + in_f->ilf_ino, item->ri_buf[1].iov_len); error = -EFSCORRUPTED; goto out_release; } @@ -500,8 +500,8 @@ xlog_recover_inode_commit_pass2( if (in_f->ilf_size == 2) goto out_owner_change; - len = item->ri_buf[2].i_len; - src = item->ri_buf[2].i_addr; + len = item->ri_buf[2].iov_len; + src = item->ri_buf[2].iov_base; ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || @@ -538,8 +538,8 @@ xlog_recover_inode_commit_pass2( } else { attr_index = 2; } - len = item->ri_buf[attr_index].i_len; - src = item->ri_buf[attr_index].i_addr; + len = item->ri_buf[attr_index].iov_len; + src = item->ri_buf[attr_index].iov_base; ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d250f7f74e3b..fe1f74a3b6a3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -444,7 +444,7 @@ static void xfs_fill_fsxattr( struct xfs_inode *ip, int whichfork, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra( xfs_inode_t *ip, void __user *arg) { - struct fileattr fa; + struct file_kattr fa; xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa); @@ -508,7 +508,7 @@ xfs_ioc_fsgetxattra( int xfs_fileattr_get( struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); @@ -526,7 +526,7 @@ static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); @@ -582,7 +582,7 @@ xfs_ioctl_setattr_xflags( static void xfs_ioctl_setattr_prepare_dax( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -642,7 +642,7 @@ out_error: static int xfs_ioctl_setattr_check_extsize( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; @@ -684,7 +684,7 @@ xfs_ioctl_setattr_check_extsize( static int xfs_ioctl_setattr_check_cowextsize( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; @@ -709,7 +709,7 @@ xfs_ioctl_setattr_check_cowextsize( static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { if (!fa->fsx_valid) return 0; @@ -725,7 +725,7 @@ int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); struct xfs_mount *mp = ip->i_mount; @@ -990,9 +990,8 @@ xfs_ioc_getlabel( BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX); /* 1 larger than sb_fname, so this ensures a trailing NUL char */ - memset(label, 0, sizeof(label)); spin_lock(&mp->m_sb_lock); - strncpy(label, sbp->sb_fname, XFSLABEL_MAX); + memtostr_pad(label, sbp->sb_fname); spin_unlock(&mp->m_sb_lock); if (copy_to_user(user_label, label, sizeof(label))) diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 12124946f347..f5ed5cf9d3df 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -17,13 +17,13 @@ xfs_ioc_swapext( extern int xfs_fileattr_get( struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); extern int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); extern long xfs_file_ioctl( diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ff05e6b1b0bb..2a74f2957341 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -79,6 +79,9 @@ xfs_iomap_valid( { struct xfs_inode *ip = XFS_I(inode); + if (iomap->type == IOMAP_HOLE) + return true; + if (iomap->validity_cookie != xfs_iomap_inode_sequence(ip, iomap->flags)) { trace_xfs_iomap_invalid(ip, iomap); @@ -89,7 +92,7 @@ xfs_iomap_valid( return true; } -static const struct iomap_folio_ops xfs_iomap_folio_ops = { +const struct iomap_write_ops xfs_iomap_write_ops = { .iomap_valid = xfs_iomap_valid, }; @@ -151,7 +154,6 @@ xfs_bmbt_to_iomap( iomap->flags |= IOMAP_F_DIRTY; iomap->validity_cookie = sequence_cookie; - iomap->folio_ops = &xfs_iomap_folio_ops; return 0; } @@ -827,7 +829,7 @@ xfs_bmap_hw_atomic_write_possible( /* * The ->iomap_begin caller should ensure this, but check anyway. */ - return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max; + return len <= xfs_inode_buftarg(ip)->bt_awu_max; } static int @@ -2198,7 +2200,8 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops, ac); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + ac); } int @@ -2214,5 +2217,6 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops, ac); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 674f8ac1b9bd..ebcce7d49446 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -57,5 +57,6 @@ extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; extern const struct iomap_ops xfs_dax_write_iomap_ops; extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops; +extern const struct iomap_write_ops xfs_iomap_write_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8cddbb7c149b..149b5460fbfd 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -665,7 +665,7 @@ xfs_get_atomic_write_max_opt( * less than our out of place write limit, but we don't want to exceed * the awu_max. */ - return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max); + return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max); } static void @@ -970,7 +970,7 @@ xfs_setattr_size( * change. */ if (xfs_is_zoned_inode(ip)) { - error = xfs_zoned_space_reserve(ip, 1, + error = xfs_zoned_space_reserve(mp, 1, XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); if (error) { if (error == -EAGAIN) @@ -998,7 +998,7 @@ xfs_setattr_size( } if (xfs_is_zoned_inode(ip)) - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(mp, &ac); if (error) return error; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 1fa1c0564b0c..c8c9b8d8309f 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -239,14 +239,10 @@ xfs_bulkstat_one( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(breq->mp, &tp); - if (error) - goto out; - + tp = xfs_trans_alloc_empty(breq->mp); error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp, breq->startino, &bc); xfs_trans_cancel(tp); -out: kfree(bc.buf); /* @@ -331,17 +327,13 @@ xfs_bulkstat( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(breq->mp, &tp); - if (error) - goto out; - + tp = xfs_trans_alloc_empty(breq->mp); if (breq->flags & XFS_IBULK_SAME_AG) iwalk_flags |= XFS_IWALK_SAME_AG; error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); -out: kfree(bc.buf); /* @@ -464,14 +456,10 @@ xfs_inumbers( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(breq->mp, &tp); - if (error) - goto out; - + tp = xfs_trans_alloc_empty(breq->mp); error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, xfs_inumbers_walk, breq->icount, &ic); xfs_trans_cancel(tp); -out: /* * We found some inode groups, so clear the error status and return diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 7db3ece370b1..c1c31d1a8e21 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -377,11 +377,8 @@ xfs_iwalk_run_callbacks( if (!has_more) return 0; - if (iwag->drop_trans) { - error = xfs_trans_alloc_empty(mp, &iwag->tp); - if (error) - return error; - } + if (iwag->drop_trans) + iwag->tp = xfs_trans_alloc_empty(mp); /* ...and recreate the cursor just past where we left off. */ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp); @@ -617,9 +614,7 @@ xfs_iwalk_ag_work( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(mp, &iwag->tp); - if (error) - goto out; + iwag->tp = xfs_trans_alloc_empty(mp); iwag->drop_trans = 1; error = xfs_iwalk_ag(iwag); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 793468b4d30d..c8a57e21a1d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,14 +109,14 @@ xlog_prepare_iovec( vec = &lv->lv_iovecp[0]; } - len = lv->lv_buf_len + sizeof(struct xlog_op_header); + len = lv->lv_buf_used + sizeof(struct xlog_op_header); if (!IS_ALIGNED(len, sizeof(uint64_t))) { - lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + lv->lv_buf_used = round_up(len, sizeof(uint64_t)) - sizeof(struct xlog_op_header); } vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; + vec->i_addr = lv->lv_buf + lv->lv_buf_used; oph = vec->i_addr; oph->oh_clientid = XFS_TRANSACTION; @@ -1931,9 +1931,9 @@ xlog_print_trans( if (!lv) continue; xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); - xfs_warn(mp, " size = %d", lv->lv_size); + xfs_warn(mp, " alloc_size = %d", lv->lv_alloc_size); xfs_warn(mp, " bytes = %d", lv->lv_bytes); - xfs_warn(mp, " buf len = %d", lv->lv_buf_len); + xfs_warn(mp, " buf used= %d", lv->lv_buf_used); /* dump each iovec for the log item */ vec = lv->lv_iovecp; @@ -3092,16 +3092,16 @@ xfs_log_force_seq( */ void xfs_log_ticket_put( - xlog_ticket_t *ticket) + struct xlog_ticket *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) kmem_cache_free(xfs_log_ticket_cache, ticket); } -xlog_ticket_t * +struct xlog_ticket * xfs_log_ticket_get( - xlog_ticket_t *ticket) + struct xlog_ticket *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); atomic_inc(&ticket->t_ref); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 13455854365f..af6daf4f6792 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -16,8 +16,8 @@ struct xfs_log_vec { struct xfs_log_item *lv_item; /* owner */ char *lv_buf; /* formatted buffer */ int lv_bytes; /* accounted space in buffer */ - int lv_buf_len; /* aligned size of buffer */ - int lv_size; /* size of allocated lv */ + int lv_buf_used; /* buffer space used so far */ + int lv_alloc_size; /* size of allocated lv */ }; #define XFS_LOG_VEC_ORDERED (-1) @@ -64,12 +64,13 @@ xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, oph->oh_len = cpu_to_be32(len); len += sizeof(struct xlog_op_header); - lv->lv_buf_len += len; + lv->lv_buf_used += len; lv->lv_bytes += len; vec->i_len = len; /* Catch buffer overruns */ - ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size); + ASSERT((void *)lv->lv_buf + lv->lv_bytes <= + (void *)lv + lv->lv_alloc_size); } /* @@ -87,13 +88,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } -static inline void * -xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, - const struct xfs_log_iovec *src) -{ - return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); -} - /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f66d2d430e4f..f443757e93c2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -275,7 +275,7 @@ xlog_cil_alloc_shadow_bufs( struct xfs_log_vec *lv; int niovecs = 0; int nbytes = 0; - int buf_size; + int alloc_size; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ @@ -316,14 +316,14 @@ xlog_cil_alloc_shadow_bufs( * that space to ensure we can align it appropriately and not * overrun the buffer. */ - buf_size = nbytes + xlog_cil_iovec_space(niovecs); + alloc_size = nbytes + xlog_cil_iovec_space(niovecs); /* * if we have no shadow buffer, or it is too small, we need to * reallocate it. */ if (!lip->li_lv_shadow || - buf_size > lip->li_lv_shadow->lv_size) { + alloc_size > lip->li_lv_shadow->lv_alloc_size) { /* * We free and allocate here as a realloc would copy * unnecessary data. We don't use kvzalloc() for the @@ -332,15 +332,15 @@ xlog_cil_alloc_shadow_bufs( * storage. */ kvfree(lip->li_lv_shadow); - lv = xlog_kvmalloc(buf_size); + lv = xlog_kvmalloc(alloc_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); INIT_LIST_HEAD(&lv->lv_list); lv->lv_item = lip; - lv->lv_size = buf_size; + lv->lv_alloc_size = alloc_size; if (ordered) - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + lv->lv_buf_used = XFS_LOG_VEC_ORDERED; else lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; lip->li_lv_shadow = lv; @@ -348,9 +348,9 @@ xlog_cil_alloc_shadow_bufs( /* same or smaller, optimise common overwrite case */ lv = lip->li_lv_shadow; if (ordered) - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + lv->lv_buf_used = XFS_LOG_VEC_ORDERED; else - lv->lv_buf_len = 0; + lv->lv_buf_used = 0; lv->lv_bytes = 0; } @@ -370,30 +370,30 @@ xlog_cil_alloc_shadow_bufs( STATIC void xfs_cil_prepare_item( struct xlog *log, + struct xfs_log_item *lip, struct xfs_log_vec *lv, - struct xfs_log_vec *old_lv, int *diff_len) { /* Account for the new LV being passed in */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and make it the shadow + * old lv, then remove the space it accounts for and make it the shadow * buffer for later freeing. In both cases we are now switching to the * shadow buffer, so update the pointer to it appropriately. */ - if (!old_lv) { + if (!lip->li_lv) { if (lv->lv_item->li_ops->iop_pin) lv->lv_item->li_ops->iop_pin(lv->lv_item); lv->lv_item->li_lv_shadow = NULL; - } else if (old_lv != lv) { - ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + } else if (lip->li_lv != lv) { + ASSERT(lv->lv_buf_used != XFS_LOG_VEC_ORDERED); - *diff_len -= old_lv->lv_bytes; - lv->lv_item->li_lv_shadow = old_lv; + *diff_len -= lip->li_lv->lv_bytes; + lv->lv_item->li_lv_shadow = lip->li_lv; } /* attach new log vector to log item */ @@ -452,10 +452,8 @@ xlog_cil_insert_format_items( } list_for_each_entry(lip, &tp->t_items, li_trans) { - struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv = NULL; - struct xfs_log_vec *shadow; - bool ordered = false; + struct xfs_log_vec *lv = lip->li_lv; + struct xfs_log_vec *shadow = lip->li_lv_shadow; /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) @@ -465,22 +463,23 @@ xlog_cil_insert_format_items( * The formatting size information is already attached to * the shadow lv on the log item. */ - shadow = lip->li_lv_shadow; - if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) - ordered = true; + if (shadow->lv_buf_used == XFS_LOG_VEC_ORDERED) { + if (!lv) { + lv = shadow; + lv->lv_item = lip; + } + ASSERT(shadow->lv_alloc_size == lv->lv_alloc_size); + xfs_cil_prepare_item(log, lip, lv, diff_len); + continue; + } /* Skip items that do not have any vectors for writing */ - if (!shadow->lv_niovecs && !ordered) + if (!shadow->lv_niovecs) continue; /* compare to existing item size */ - old_lv = lip->li_lv; - if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { + if (lv && shadow->lv_alloc_size <= lv->lv_alloc_size) { /* same or smaller, optimise common overwrite case */ - lv = lip->li_lv; - - if (ordered) - goto insert; /* * set the item up as though it is a new insertion so @@ -492,7 +491,7 @@ xlog_cil_insert_format_items( lv->lv_niovecs = shadow->lv_niovecs; /* reset the lv buffer information for new formatting */ - lv->lv_buf_len = 0; + lv->lv_buf_used = 0; lv->lv_bytes = 0; lv->lv_buf = (char *)lv + xlog_cil_iovec_space(lv->lv_niovecs); @@ -500,17 +499,11 @@ xlog_cil_insert_format_items( /* switch to shadow buffer! */ lv = shadow; lv->lv_item = lip; - if (ordered) { - /* track as an ordered logvec */ - ASSERT(lip->li_lv == NULL); - goto insert; - } } ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); -insert: - xfs_cil_prepare_item(log, lv, old_lv, diff_len); + xfs_cil_prepare_item(log, lip, lv, diff_len); } } @@ -793,8 +786,10 @@ xlog_cil_ail_insert( struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; - if (aborted) + if (aborted) { + trace_xlog_ail_insert_abort(lip); set_bit(XFS_LI_ABORTED, &lip->li_flags); + } if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { lip->li_ops->iop_release(lip); @@ -1243,7 +1238,7 @@ xlog_cil_build_lv_chain( lv->lv_order_id = item->li_order_id; /* we don't write ordered log vectors */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED) *num_bytes += lv->lv_bytes; *num_iovecs += lv->lv_niovecs; list_add_tail(&lv->lv_list, &ctx->lv_chain); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 39a102cc1b43..a9a7a271c15b 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -144,7 +144,7 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -typedef struct xlog_ticket { +struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier */ @@ -155,7 +155,7 @@ typedef struct xlog_ticket { char t_cnt; /* current unit count */ uint8_t t_flags; /* properties of reservation */ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */ -} xlog_ticket_t; +}; /* * - A log record header is 512 bytes. There is plenty of room to grow the diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2f76531842f8..e6ed9e09c027 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2131,15 +2131,15 @@ xlog_recover_add_to_cont_trans( item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, ri_list); - old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; - old_len = item->ri_buf[item->ri_cnt-1].i_len; + old_ptr = item->ri_buf[item->ri_cnt-1].iov_base; + old_len = item->ri_buf[item->ri_cnt-1].iov_len; ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL); if (!ptr) return -ENOMEM; memcpy(&ptr[old_len], dp, len); - item->ri_buf[item->ri_cnt-1].i_len += len; - item->ri_buf[item->ri_cnt-1].i_addr = ptr; + item->ri_buf[item->ri_cnt-1].iov_len += len; + item->ri_buf[item->ri_cnt-1].iov_base = ptr; trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -2223,7 +2223,7 @@ xlog_recover_add_to_trans( } item->ri_total = in_f->ilf_size; - item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t), + item->ri_buf = kcalloc(item->ri_total, sizeof(*item->ri_buf), GFP_KERNEL | __GFP_NOFAIL); } @@ -2237,8 +2237,8 @@ xlog_recover_add_to_trans( } /* Description region is ri_buf[0] */ - item->ri_buf[item->ri_cnt].i_addr = ptr; - item->ri_buf[item->ri_cnt].i_len = len; + item->ri_buf[item->ri_cnt].iov_base = ptr; + item->ri_buf[item->ri_cnt].iov_len = len; item->ri_cnt++; trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; @@ -2262,7 +2262,7 @@ xlog_recover_free_trans( /* Free the regions in the item. */ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) - kvfree(item->ri_buf[i].i_addr); + kvfree(item->ri_buf[i].iov_base); /* Free the item itself */ kfree(item->ri_buf); kfree(item); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 29276fe60df9..2133fbaf1766 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -171,19 +171,16 @@ xfs_readsb( ASSERT(mp->m_ddev_targp != NULL); /* - * For the initial read, we must guess at the sector - * size based on the block device. It's enough to - * get the sb_sectsize out of the superblock and - * then reread with the proper length. - * We don't verify it yet, because it may not be complete. + * In the first pass, use the device sector size to just read enough + * of the superblock to extract the XFS sector size. + * + * The device sector size must be smaller than or equal to the XFS + * sector size and thus we can always read the superblock. Once we know + * the XFS sector size, re-read it and run the buffer verifier. */ - sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + sector_size = mp->m_ddev_targp->bt_logical_sectorsize; buf_ops = NULL; - /* - * Allocate a (locked) buffer to hold the superblock. This will be kept - * around at all times to optimize access to the superblock. - */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), &bp, buf_ops); @@ -247,6 +244,10 @@ reread: /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; + /* + * Keep a pointer of the sb buffer around instead of caching it in the + * buffer cache because we access it frequently. + */ mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; @@ -672,74 +673,47 @@ static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); } -static inline unsigned int max_pow_of_two_factor(const unsigned int nr) -{ - return 1 << (ffs(nr) - 1); -} - /* - * If the data device advertises atomic write support, limit the size of data - * device atomic writes to the greatest power-of-two factor of the AG size so - * that every atomic write unit aligns with the start of every AG. This is - * required so that the per-AG allocations for an atomic write will always be + * If the underlying device advertises atomic write support, limit the size of + * atomic writes to the greatest power-of-two factor of the group size so + * that every atomic write unit aligns with the start of every group. This is + * required so that the allocations for an atomic write will always be * aligned compatibly with the alignment requirements of the storage. * - * If the data device doesn't advertise atomic writes, then there are no - * alignment restrictions and the largest out-of-place write we can do - * ourselves is the number of blocks that user files can allocate from any AG. + * If the device doesn't advertise atomic writes, then there are no alignment + * restrictions and the largest out-of-place write we can do ourselves is the + * number of blocks that user files can allocate from any group. */ -static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp) -{ - if (mp->m_ddev_targp->bt_bdev_awu_min > 0) - return max_pow_of_two_factor(mp->m_sb.sb_agblocks); - return rounddown_pow_of_two(mp->m_ag_max_usable); -} - -/* - * Reflink on the realtime device requires rtgroups, and atomic writes require - * reflink. - * - * If the realtime device advertises atomic write support, limit the size of - * data device atomic writes to the greatest power-of-two factor of the rtgroup - * size so that every atomic write unit aligns with the start of every rtgroup. - * This is required so that the per-rtgroup allocations for an atomic write - * will always be aligned compatibly with the alignment requirements of the - * storage. - * - * If the rt device doesn't advertise atomic writes, then there are no - * alignment restrictions and the largest out-of-place write we can do - * ourselves is the number of blocks that user files can allocate from any - * rtgroup. - */ -static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp) +static xfs_extlen_t +xfs_calc_group_awu_max( + struct xfs_mount *mp, + enum xfs_group_type type) { - struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + struct xfs_groups *g = &mp->m_groups[type]; + struct xfs_buftarg *btp = xfs_group_type_buftarg(mp, type); - if (rgs->blocks == 0) + if (g->blocks == 0) return 0; - if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > 0) - return max_pow_of_two_factor(rgs->blocks); - return rounddown_pow_of_two(rgs->blocks); + if (btp && btp->bt_awu_min > 0) + return max_pow_of_two_factor(g->blocks); + return rounddown_pow_of_two(g->blocks); } /* Compute the maximum atomic write unit size for each section. */ static inline void xfs_calc_atomic_write_unit_max( - struct xfs_mount *mp) + struct xfs_mount *mp, + enum xfs_group_type type) { - struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; - struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + struct xfs_groups *g = &mp->m_groups[type]; const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); - const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp); - const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp); - - ags->awu_max = min3(max_write, max_ioend, max_agsize); - rgs->awu_max = min3(max_write, max_ioend, max_rgsize); + const xfs_extlen_t max_gsize = xfs_calc_group_awu_max(mp, type); - trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend, - max_agsize, max_rgsize); + g->awu_max = min3(max_write, max_ioend, max_gsize); + trace_xfs_calc_atomic_write_unit_max(mp, type, max_write, max_ioend, + max_gsize, g->awu_max); } /* @@ -757,7 +731,8 @@ xfs_set_max_atomic_write_opt( max(mp->m_groups[XG_TYPE_AG].blocks, mp->m_groups[XG_TYPE_RTG].blocks); const xfs_extlen_t max_group_write = - max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp)); + max(xfs_calc_group_awu_max(mp, XG_TYPE_AG), + xfs_calc_group_awu_max(mp, XG_TYPE_RTG)); int error; if (new_max_bytes == 0) @@ -813,7 +788,8 @@ set_limit: return error; } - xfs_calc_atomic_write_unit_max(mp); + xfs_calc_atomic_write_unit_max(mp, XG_TYPE_AG); + xfs_calc_atomic_write_unit_max(mp, XG_TYPE_RTG); mp->m_awu_max_bytes = new_max_bytes; return 0; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d85084f9f317..97de44c32272 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -802,4 +802,21 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, unsigned long long new_max_bytes); +static inline struct xfs_buftarg * +xfs_group_type_buftarg( + struct xfs_mount *mp, + enum xfs_group_type type) +{ + switch (type) { + case XG_TYPE_AG: + return mp->m_ddev_targp; + case XG_TYPE_RTG: + return mp->m_rtdev_targp; + default: + ASSERT(0); + break; + } + return NULL; +} + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 08443ceec329..866c71d9fbae 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -320,7 +320,7 @@ xfs_mru_cache_create( xfs_mru_cache_free_func_t free_func) { struct xfs_mru_cache *mru = NULL; - int err = 0, grp; + int grp; unsigned int grp_time; if (mrup) @@ -341,8 +341,8 @@ xfs_mru_cache_create( mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists), GFP_KERNEL | __GFP_NOFAIL); if (!mru->lists) { - err = -ENOMEM; - goto exit; + kfree(mru); + return -ENOMEM; } for (grp = 0; grp < mru->grp_count; grp++) @@ -361,14 +361,7 @@ xfs_mru_cache_create( mru->free_func = free_func; mru->data = data; *mrup = mru; - -exit: - if (err && mru && mru->lists) - kfree(mru->lists); - if (err && mru) - kfree(mru); - - return err; + return 0; } /* @@ -425,10 +418,6 @@ xfs_mru_cache_insert( { int error = -EINVAL; - ASSERT(mru && mru->lists); - if (!mru || !mru->lists) - goto out_free; - error = -ENOMEM; if (radix_tree_preload(GFP_KERNEL)) goto out_free; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 3545dc1d953c..fbeddcac4792 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -253,8 +253,7 @@ xfs_dax_notify_dev_failure( return -EOPNOTSUPP; } - error = xfs_dax_translate_range(type == XG_TYPE_RTG ? - mp->m_rtdev_targp : mp->m_ddev_targp, + error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type), offset, len, &daddr, &bblen); if (error) return error; @@ -280,10 +279,7 @@ xfs_dax_notify_dev_failure( kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; } - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - goto out; - + tp = xfs_trans_alloc_empty(mp); start_gno = xfs_fsb_to_gno(mp, start_bno, type); end_gno = xfs_fsb_to_gno(mp, end_bno, type); while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { @@ -354,7 +350,6 @@ xfs_dax_notify_dev_failure( error = -EFSCORRUPTED; } -out: /* Thaw the fs if it has been frozen before. */ if (mf_flags & MF_MEM_PRE_REMOVE) xfs_dax_notify_failure_thaw(mp, kernel_frozen); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 417439b58785..23ba84ec919a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -134,6 +134,7 @@ xfs_qm_dqpurge( dqp->q_flags |= XFS_DQFLAG_FREEING; + xfs_qm_dqunpin_wait(dqp); xfs_dqflock(dqp); /* @@ -465,6 +466,7 @@ xfs_qm_dquot_isolate( struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); struct xfs_qm_isolate *isol = arg; + enum lru_status ret = LRU_SKIP; if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; @@ -478,6 +480,16 @@ xfs_qm_dquot_isolate( goto out_miss_unlock; /* + * If the dquot is pinned or dirty, rotate it to the end of the LRU to + * give some time for it to be cleaned before we try to isolate it + * again. + */ + ret = LRU_ROTATE; + if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) { + goto out_miss_unlock; + } + + /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ @@ -492,41 +504,14 @@ xfs_qm_dquot_isolate( } /* - * If the dquot is dirty, flush it. If it's already being flushed, just - * skip it so there is time for the IO to complete before we try to - * reclaim it again on the next LRU pass. + * The dquot may still be under IO, in which case the flush lock will be + * held. If we can't get the flush lock now, just skip over the dquot as + * if it was dirty. */ if (!xfs_dqflock_nowait(dqp)) goto out_miss_unlock; - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* we have to drop the LRU lock to flush the dquot */ - spin_unlock(&lru->lock); - - error = xfs_dquot_use_attached_buf(dqp, &bp); - if (!bp || error == -EAGAIN) { - xfs_dqfunlock(dqp); - goto out_unlock_dirty; - } - - /* - * dqflush completes dqflock on error, and the delwri ioend - * does it on success. - */ - error = xfs_qm_dqflush(dqp, bp); - if (error) - goto out_unlock_dirty; - - xfs_buf_delwri_queue(bp, &isol->buffers); - xfs_buf_relse(bp); - goto out_unlock_dirty; - } - + ASSERT(!XFS_DQ_IS_DIRTY(dqp)); xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); @@ -548,13 +533,7 @@ out_miss_unlock: out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); - return LRU_SKIP; - -out_unlock_dirty: - trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); - xfs_dqunlock(dqp); - return LRU_RETRY; + return ret; } static unsigned long @@ -681,10 +660,7 @@ xfs_qm_load_metadir_qinos( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); if (error == -ENOENT) { /* no quota dir directory, but we'll create one later */ @@ -1486,7 +1462,6 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; @@ -1497,34 +1472,8 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - /* - * The only way the dquot is already flush locked by the time quotacheck - * gets here is if reclaim flushed it before the dqadjust walk dirtied - * it for the final time. Quotacheck collects all dquot bufs in the - * local delwri queue before dquots are dirtied, so reclaim can't have - * possibly queued it for I/O. The only way out is to push the buffer to - * cycle the flush lock. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* buf is pinned in-core by delwri list */ - error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) - goto out_unlock; - - if (!(bp->b_flags & _XBF_DELWRI_Q)) { - error = -EAGAIN; - xfs_buf_relse(bp); - goto out_unlock; - } - xfs_buf_unlock(bp); - - xfs_buf_delwri_pushbuf(bp, buffer_list); - xfs_buf_rele(bp); - - error = -EAGAIN; - goto out_unlock; - } + xfs_qm_dqunpin_wait(dqp); + xfs_dqflock(dqp); error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) @@ -1803,10 +1752,7 @@ xfs_qm_qino_load( struct xfs_inode *dp = NULL; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); if (xfs_has_metadir(mp)) { error = xfs_dqinode_load_parent(tp, &dp); if (error) diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 076501123d89..3728234699a2 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -717,18 +717,18 @@ xlog_recover_cui_commit_pass2( struct xfs_cui_log_format *cui_formatp; size_t len; - cui_formatp = item->ri_buf[0].i_addr; + cui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -759,18 +759,18 @@ xlog_recover_rtcui_commit_pass2( struct xfs_cui_log_format *cui_formatp; size_t len; - cui_formatp = item->ri_buf[0].i_addr; + cui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -791,7 +791,7 @@ xlog_recover_rtcui_commit_pass2( xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif @@ -817,10 +817,10 @@ xlog_recover_cud_commit_pass2( { struct xfs_cud_log_format *cud_formatp; - cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + cud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -843,10 +843,10 @@ xlog_recover_rtcud_commit_pass2( { struct xfs_cud_log_format *cud_formatp; - cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + cud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ad3bcb76d805..3f177b4ec131 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1881,7 +1881,8 @@ xfs_reflink_unshare( &xfs_dax_write_iomap_ops); else error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, + &xfs_iomap_write_ops); if (error) goto out; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c99700318ec2..15f0903f6fd4 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -746,18 +746,18 @@ xlog_recover_rui_commit_pass2( struct xfs_rui_log_format *rui_formatp; size_t len; - rui_formatp = item->ri_buf[0].i_addr; + rui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -788,18 +788,18 @@ xlog_recover_rtrui_commit_pass2( struct xfs_rui_log_format *rui_formatp; size_t len; - rui_formatp = item->ri_buf[0].i_addr; + rui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -820,7 +820,7 @@ xlog_recover_rtrui_commit_pass2( xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif @@ -846,10 +846,10 @@ xlog_recover_rud_commit_pass2( { struct xfs_rud_log_format *rud_formatp; - rud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + rud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - rud_formatp, item->ri_buf[0].i_len); + rud_formatp, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -872,10 +872,10 @@ xlog_recover_rtrud_commit_pass2( { struct xfs_rud_log_format *rud_formatp; - rud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + rud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - rud_formatp, item->ri_buf[0].i_len); + rud_formatp, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6484c596ecea..6907e871fa15 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -729,9 +729,7 @@ xfs_rtginode_ensure( if (rtg->rtg_inodes[type]) return 0; - error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(rtg_mount(rtg)); error = xfs_rtginode_load(rtg, type, tp); xfs_trans_cancel(tp); @@ -1259,6 +1257,8 @@ xfs_growfs_check_rtgeom( kfree(nmp); + trace_xfs_growfs_check_rtgeom(mp, min_logfsbs); + if (min_logfsbs > mp->m_sb.sb_logblocks) return -EINVAL; @@ -1303,9 +1303,7 @@ xfs_growfs_rt_prep_groups( if (!mp->m_rtdirip) { struct xfs_trans *tp; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); error = xfs_rtginode_load_parent(tp); xfs_trans_cancel(tp); @@ -1672,10 +1670,7 @@ xfs_rtmount_inodes( struct xfs_rtgroup *rtg = NULL; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) { error = xfs_rtginode_load_parent(tp); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0bc4b5489078..bb0a82635a77 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2020,14 +2020,13 @@ xfs_remount_rw( int error; if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && - bdev_read_only(mp->m_logdev_targp->bt_bdev)) { + xfs_readonly_buftarg(mp->m_logdev_targp)) { xfs_warn(mp, "ro->rw transition prohibited by read-only logdev"); return -EACCES; } - if (mp->m_rtdev_targp && - bdev_read_only(mp->m_rtdev_targp->bt_bdev)) { + if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) { xfs_warn(mp, "ro->rw transition prohibited by read-only rtdev"); return -EACCES; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 01d284a1c759..e1794e3e3156 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -171,36 +171,33 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); TRACE_EVENT(xfs_calc_atomic_write_unit_max, - TP_PROTO(struct xfs_mount *mp, unsigned int max_write, - unsigned int max_ioend, unsigned int max_agsize, - unsigned int max_rgsize), - TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize), + TP_PROTO(struct xfs_mount *mp, enum xfs_group_type type, + unsigned int max_write, unsigned int max_ioend, + unsigned int max_gsize, unsigned int awu_max), + TP_ARGS(mp, type, max_write, max_ioend, max_gsize, awu_max), TP_STRUCT__entry( __field(dev_t, dev) + __field(enum xfs_group_type, type) __field(unsigned int, max_write) __field(unsigned int, max_ioend) - __field(unsigned int, max_agsize) - __field(unsigned int, max_rgsize) - __field(unsigned int, data_awu_max) - __field(unsigned int, rt_awu_max) + __field(unsigned int, max_gsize) + __field(unsigned int, awu_max) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __entry->type = type; __entry->max_write = max_write; __entry->max_ioend = max_ioend; - __entry->max_agsize = max_agsize; - __entry->max_rgsize = max_rgsize; - __entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max; - __entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max; + __entry->max_gsize = max_gsize; + __entry->awu_max = awu_max; ), - TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u", + TP_printk("dev %d:%d %s max_write %u max_ioend %u max_gsize %u awu_max %u", MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), __entry->max_write, __entry->max_ioend, - __entry->max_agsize, - __entry->max_rgsize, - __entry->data_awu_max, - __entry->rt_awu_max) + __entry->max_gsize, + __entry->awu_max) ); TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, @@ -428,8 +425,8 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class, __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, allocated) __field(xfs_rgblock_t, written) - __field(xfs_rgblock_t, write_pointer) __field(xfs_rgblock_t, rgbno) __field(xfs_extlen_t, len) ), @@ -437,17 +434,17 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class, __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(oz->oz_rtg); __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->allocated = oz->oz_allocated; __entry->written = oz->oz_written; - __entry->write_pointer = oz->oz_write_pointer; __entry->rgbno = rgbno; __entry->len = len; ), - TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + TP_printk("dev %d:%d rgno 0x%x used 0x%x alloced 0x%x written 0x%x rgbno 0x%x len 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->used, + __entry->allocated, __entry->written, - __entry->write_pointer, __entry->rgbno, __entry->len) ); @@ -778,7 +775,6 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); -DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_iodone_async); @@ -1083,7 +1079,9 @@ DEFINE_INODE_EVENT(xfs_get_acl); #endif DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); +#ifdef CONFIG_COMPAT DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +#endif DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); @@ -1147,6 +1145,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __field(xfs_ino_t, ino) __field(int, count) __field(int, pincount) + __field(unsigned long, iflags) __field(unsigned long, caller_ip) ), TP_fast_assign( @@ -1154,13 +1153,15 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->ino = ip->i_ino; __entry->count = atomic_read(&VFS_I(ip)->i_count); __entry->pincount = atomic_read(&ip->i_pincount); + __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, __entry->pincount, + __entry->iflags, (char *)__entry->caller_ip) ) @@ -1250,6 +1251,8 @@ DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); +DEFINE_IREF_EVENT(xfs_inode_push_pinned); +DEFINE_IREF_EVENT(xfs_inode_push_stale); DECLARE_EVENT_CLASS(xfs_namespace_class, TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), @@ -1393,7 +1396,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); -DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); DEFINE_DQUOT_EVENT(xfs_dqattach_found); @@ -1598,7 +1600,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); -DEFINE_LOGGRANT_EVENT(xfs_log_cil_return); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -1654,6 +1655,8 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); +DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort); +DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort); DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), @@ -1859,8 +1862,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); -DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); -DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); @@ -1893,31 +1894,6 @@ DEFINE_EVENT(xfs_itrunc_class, name, \ DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); -TRACE_EVENT(xfs_pagecache_inval, - TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), - TP_ARGS(ip, start, finish), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fsize_t, size) - __field(xfs_off_t, start) - __field(xfs_off_t, finish) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->size = ip->i_disk_size; - __entry->start = start; - __entry->finish = finish; - ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->size, - __entry->start, - __entry->finish) -); - TRACE_EVENT(xfs_bunmap, TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len, int flags, unsigned long caller_ip), @@ -2263,14 +2239,12 @@ DEFINE_EVENT(xfs_alloc_class, name, \ DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); -DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); DEFINE_ALLOC_EVENT(xfs_alloc_cur); DEFINE_ALLOC_EVENT(xfs_alloc_cur_right); DEFINE_ALLOC_EVENT(xfs_alloc_cur_left); DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup); DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done); -DEFINE_ALLOC_EVENT(xfs_alloc_near_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); @@ -2465,13 +2439,8 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_replace); -DEFINE_ATTR_EVENT(xfs_attr_node_removename); - -DEFINE_ATTR_EVENT(xfs_attr_fillstate); -DEFINE_ATTR_EVENT(xfs_attr_refillstate); DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); -DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ @@ -2889,7 +2858,6 @@ DEFINE_EVENT(xfs_rtdiscard_class, name, \ TP_ARGS(mp, rtbno, len)) DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent); DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall); -DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax); DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), @@ -4206,36 +4174,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); -/* dedupe tracepoints */ -DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); - -/* ioctl tracepoints */ -TRACE_EVENT(xfs_ioctl_clone, - TP_PROTO(struct inode *src, struct inode *dest), - TP_ARGS(src, dest), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(unsigned long, src_ino) - __field(loff_t, src_isize) - __field(unsigned long, dest_ino) - __field(loff_t, dest_isize) - ), - TP_fast_assign( - __entry->dev = src->i_sb->s_dev; - __entry->src_ino = src->i_ino; - __entry->src_isize = i_size_read(src); - __entry->dest_ino = dest->i_ino; - __entry->dest_isize = i_size_read(dest); - ), - TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->src_ino, - __entry->src_isize, - __entry->dest_ino, - __entry->dest_isize) -); - /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); @@ -4243,7 +4181,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -5028,7 +4965,6 @@ DEFINE_ICLOG_EVENT(xlog_iclog_switch); DEFINE_ICLOG_EVENT(xlog_iclog_sync); DEFINE_ICLOG_EVENT(xlog_iclog_syncing); DEFINE_ICLOG_EVENT(xlog_iclog_sync_done); -DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); @@ -5077,7 +5013,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); -DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c6657072361a..ece374d622b3 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,18 +134,14 @@ xfs_trans_dup( } /* - * This is called to reserve free disk blocks and log space for the - * given transaction. This must be done before allocating any resources - * within the transaction. + * This is called to reserve free disk blocks and log space for the given + * transaction before allocating any resources within the transaction. * * This will return ENOSPC if there are not enough blocks available. * It will sleep waiting for available log space. - * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which - * is used by long running transactions. If any one of the reservations - * fails then they will all be backed out. * - * This does not do quota reservations. That typically is done by the - * caller afterwards. + * This does not do quota reservations. That typically is done by the caller + * afterwards. */ static int xfs_trans_reserve( @@ -158,10 +154,12 @@ xfs_trans_reserve( int error = 0; bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + ASSERT(resp->tr_logres > 0); + /* - * Attempt to reserve the needed disk blocks by decrementing - * the number needed from the number available. This will - * fail if the count would go below zero. + * Attempt to reserve the needed disk blocks by decrementing the number + * needed from the number available. This will fail if the count would + * go below zero. */ if (blocks > 0) { error = xfs_dec_fdblocks(mp, blocks, rsvd); @@ -173,42 +171,20 @@ xfs_trans_reserve( /* * Reserve the log space needed for this transaction. */ - if (resp->tr_logres > 0) { - bool permanent = false; - - ASSERT(tp->t_log_res == 0 || - tp->t_log_res == resp->tr_logres); - ASSERT(tp->t_log_count == 0 || - tp->t_log_count == resp->tr_logcount); - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - tp->t_flags |= XFS_TRANS_PERM_LOG_RES; - permanent = true; - } else { - ASSERT(tp->t_ticket == NULL); - ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - } - - if (tp->t_ticket != NULL) { - ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(mp, tp->t_ticket); - } else { - error = xfs_log_reserve(mp, resp->tr_logres, - resp->tr_logcount, - &tp->t_ticket, permanent); - } - - if (error) - goto undo_blocks; + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) + tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, + &tp->t_ticket, (tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + if (error) + goto undo_blocks; - tp->t_log_res = resp->tr_logres; - tp->t_log_count = resp->tr_logcount; - } + tp->t_log_res = resp->tr_logres; + tp->t_log_count = resp->tr_logcount; /* - * Attempt to reserve the needed realtime extents by decrementing - * the number needed from the number available. This will - * fail if the count would go below zero. + * Attempt to reserve the needed realtime extents by decrementing the + * number needed from the number available. This will fail if the + * count would go below zero. */ if (rtextents > 0) { error = xfs_dec_frextents(mp, rtextents); @@ -221,18 +197,11 @@ xfs_trans_reserve( return 0; - /* - * Error cases jump to one of these labels to undo any - * reservations which have already been performed. - */ undo_log: - if (resp->tr_logres > 0) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); - tp->t_ticket = NULL; - tp->t_log_res = 0; - tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; - } - + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + tp->t_ticket = NULL; + tp->t_log_res = 0; + tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; undo_blocks: if (blocks > 0) { xfs_add_fdblocks(mp, blocks); @@ -241,6 +210,28 @@ undo_blocks: return error; } +static struct xfs_trans * +__xfs_trans_alloc( + struct xfs_mount *mp, + uint flags) +{ + struct xfs_trans *tp; + + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp)); + + tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) + sb_start_intwrite(mp->m_super); + xfs_trans_set_context(tp); + tp->t_flags = flags; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + INIT_LIST_HEAD(&tp->t_dfops); + tp->t_highest_agno = NULLAGNUMBER; + return tp; +} + int xfs_trans_alloc( struct xfs_mount *mp, @@ -254,33 +245,16 @@ xfs_trans_alloc( bool want_retry = true; int error; + ASSERT(resp->tr_logres > 0); + /* * Allocate the handle before we do our freeze accounting and setting up * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ retry: - tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL); - if (!(flags & XFS_TRANS_NO_WRITECOUNT)) - sb_start_intwrite(mp->m_super); - xfs_trans_set_context(tp); - - /* - * Zero-reservation ("empty") transactions can't modify anything, so - * they're allowed to run while we're frozen. - */ - WARN_ON(resp->tr_logres > 0 && - mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || - xfs_has_lazysbcount(mp)); - - tp->t_flags = flags; - tp->t_mountp = mp; - INIT_LIST_HEAD(&tp->t_items); - INIT_LIST_HEAD(&tp->t_busy); - INIT_LIST_HEAD(&tp->t_dfops); - tp->t_highest_agno = NULLAGNUMBER; - + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + tp = __xfs_trans_alloc(mp, flags); error = xfs_trans_reserve(tp, resp, blocks, rtextents); if (error == -ENOSPC && want_retry) { xfs_trans_cancel(tp); @@ -324,14 +298,11 @@ retry: * where we can be grabbing buffers at the same time that freeze is trying to * drain the buffer LRU list. */ -int +struct xfs_trans * xfs_trans_alloc_empty( - struct xfs_mount *mp, - struct xfs_trans **tpp) + struct xfs_mount *mp) { - struct xfs_trans_res resv = {0}; - - return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); + return __xfs_trans_alloc(mp, XFS_TRANS_NO_WRITECOUNT); } /* @@ -742,8 +713,10 @@ xfs_trans_free_items( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); - if (abort) + if (abort) { + trace_xfs_trans_free_abort(lip); set_bit(XFS_LI_ABORTED, &lip->li_flags); + } if (lip->li_ops->iop_release) lip->li_ops->iop_release(lip); } @@ -1024,51 +997,57 @@ xfs_trans_cancel( } /* - * Roll from one trans in the sequence of PERMANENT transactions to - * the next: permanent transactions are only flushed out when - * committed with xfs_trans_commit(), but we still want as soon - * as possible to let chunks of it go to the log. So we commit the - * chunk we've been working on and get a new transaction to continue. + * Roll from one trans in the sequence of PERMANENT transactions to the next: + * permanent transactions are only flushed out when committed with + * xfs_trans_commit(), but we still want as soon as possible to let chunks of it + * go to the log. So we commit the chunk we've been working on and get a new + * transaction to continue. */ int xfs_trans_roll( struct xfs_trans **tpp) { - struct xfs_trans *trans = *tpp; - struct xfs_trans_res tres; + struct xfs_trans *tp = *tpp; + unsigned int log_res = tp->t_log_res; + unsigned int log_count = tp->t_log_count; int error; - trace_xfs_trans_roll(trans, _RET_IP_); + trace_xfs_trans_roll(tp, _RET_IP_); + + ASSERT(log_res > 0); /* * Copy the critical parameters from one trans to the next. */ - tres.tr_logres = trans->t_log_res; - tres.tr_logcount = trans->t_log_count; - - *tpp = xfs_trans_dup(trans); + *tpp = xfs_trans_dup(tp); /* * Commit the current transaction. - * If this commit failed, then it'd just unlock those items that - * are not marked ihold. That also means that a filesystem shutdown - * is in progress. The caller takes the responsibility to cancel - * the duplicate transaction that gets returned. + * + * If this commit failed, then it'd just unlock those items that are not + * marked ihold. That also means that a filesystem shutdown is in + * progress. The caller takes the responsibility to cancel the + * duplicate transaction that gets returned. */ - error = __xfs_trans_commit(trans, true); + error = __xfs_trans_commit(tp, true); if (error) return error; /* * Reserve space in the log for the next transaction. - * This also pushes items in the "AIL", the list of logged items, - * out to disk if they are taking up space at the tail of the log - * that we want to use. This requires that either nothing be locked - * across this call, or that anything that is locked be logged in - * the prior and the next transactions. + * + * This also pushes items in the AIL out to disk if they are taking up + * space at the tail of the log that we want to use. This requires that + * either nothing be locked across this call, or that anything that is + * locked be logged in the prior and the next transactions. */ - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - return xfs_trans_reserve(*tpp, &tres, 0, 0); + tp = *tpp; + error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + if (error) + return error; + tp->t_log_res = log_res; + tp->t_log_count = log_count; + return 0; } /* @@ -1144,9 +1123,18 @@ xfs_trans_reserve_more( unsigned int blocks, unsigned int rtextents) { - struct xfs_trans_res resv = { }; - - return xfs_trans_reserve(tp, &resv, blocks, rtextents); + bool rsvd = tp->t_flags & XFS_TRANS_RESERVE; + + if (blocks && xfs_dec_fdblocks(tp->t_mountp, blocks, rsvd)) + return -ENOSPC; + if (rtextents && xfs_dec_frextents(tp->t_mountp, rtextents)) { + if (blocks) + xfs_add_fdblocks(tp->t_mountp, blocks); + return -ENOSPC; + } + tp->t_blk_res += blocks; + tp->t_rtx_res += rtextents; + return 0; } /* @@ -1161,14 +1149,13 @@ xfs_trans_reserve_more_inode( unsigned int rblocks, bool force_quota) { - struct xfs_trans_res resv = { }; struct xfs_mount *mp = ip->i_mount; unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + error = xfs_trans_reserve_more(tp, dblocks, rtx); if (error) return error; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 2b366851e9a4..7fb860f645a3 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -15,7 +15,6 @@ struct xfs_efd_log_item; struct xfs_efi_log_item; struct xfs_inode; struct xfs_item_ops; -struct xfs_log_iovec; struct xfs_mount; struct xfs_trans; struct xfs_trans_res; @@ -168,8 +167,7 @@ int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, struct xfs_trans **tpp); int xfs_trans_reserve_more(struct xfs_trans *tp, unsigned int blocks, unsigned int rtextents); -int xfs_trans_alloc_empty(struct xfs_mount *mp, - struct xfs_trans **tpp); +struct xfs_trans *xfs_trans_alloc_empty(struct xfs_mount *mp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 0f641a9091ec..ac5cecec9aa1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -243,7 +243,7 @@ __xfs_xattr_put_listent( offset = context->buffer + context->count; memcpy(offset, prefix, prefix_len); offset += prefix_len; - strncpy(offset, (char *)name, namelen); /* real name */ + memcpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 80add26c0111..33f7eee521a8 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -434,7 +434,7 @@ xfs_init_open_zone( spin_lock_init(&oz->oz_alloc_lock); atomic_set(&oz->oz_ref, 1); oz->oz_rtg = rtg; - oz->oz_write_pointer = write_pointer; + oz->oz_allocated = write_pointer; oz->oz_written = write_pointer; oz->oz_write_hint = write_hint; oz->oz_is_gc = is_gc; @@ -569,7 +569,7 @@ xfs_try_use_zone( struct xfs_open_zone *oz, bool lowspace) { - if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; if (!lowspace && !xfs_good_hint_match(oz, file_hint)) return false; @@ -654,13 +654,6 @@ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) !(ip->i_diflags & XFS_DIFLAG_APPEND); } -/* - * Pick a new zone for writes. - * - * If we aren't using up our budget of open zones just open a new one from the - * freelist. Else try to find one that matches the expected data lifetime. If - * we don't find one that is good pick any zone that is available. - */ static struct xfs_open_zone * xfs_select_zone_nowait( struct xfs_mount *mp, @@ -688,7 +681,8 @@ xfs_select_zone_nowait( goto out_unlock; /* - * See if we can open a new zone and use that. + * See if we can open a new zone and use that so that data for different + * files is mixed as little as possible. */ oz = xfs_try_open_zone(mp, write_hint); if (oz) @@ -727,7 +721,7 @@ xfs_select_zone( for (;;) { prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); - if (oz) + if (oz || xfs_is_shutdown(mp)) break; schedule(); } @@ -744,25 +738,25 @@ xfs_zone_alloc_blocks( { struct xfs_rtgroup *rtg = oz->oz_rtg; struct xfs_mount *mp = rtg_mount(rtg); - xfs_rgblock_t rgbno; + xfs_rgblock_t allocated; spin_lock(&oz->oz_alloc_lock); count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, - (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated); if (!count_fsb) { spin_unlock(&oz->oz_alloc_lock); return 0; } - rgbno = oz->oz_write_pointer; - oz->oz_write_pointer += count_fsb; + allocated = oz->oz_allocated; + oz->oz_allocated += count_fsb; spin_unlock(&oz->oz_alloc_lock); - trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb); *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); if (!*is_seq) - *sector += XFS_FSB_TO_BB(mp, rgbno); + *sector += XFS_FSB_TO_BB(mp, allocated); return XFS_FSB_TO_B(mp, count_fsb); } @@ -777,26 +771,6 @@ xfs_mark_rtg_boundary( ioend->io_flags |= IOMAP_IOEND_BOUNDARY; } -static void -xfs_submit_zoned_bio( - struct iomap_ioend *ioend, - struct xfs_open_zone *oz, - bool is_seq) -{ - ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; - ioend->io_private = oz; - atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ - - if (is_seq) { - ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; - ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; - } else { - xfs_mark_rtg_boundary(ioend); - } - - submit_bio(&ioend->io_bio); -} - /* * Cache the last zone written to for an inode so that it is considered first * for subsequent writes. @@ -891,6 +865,26 @@ xfs_zone_cache_create_association( xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); } +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + void xfs_zone_alloc_and_submit( struct iomap_ioend *ioend, @@ -983,7 +977,7 @@ xfs_zone_rgbno_is_valid( lockdep_assert_held(&rtg_rmap(rtg)->i_lock); if (rtg->rtg_open_zone) - return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return rgbno < rtg->rtg_open_zone->oz_allocated; return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, rtg_rgno(rtg), XFS_RTG_FREE); } @@ -1017,7 +1011,7 @@ xfs_init_zone( { struct xfs_mount *mp = rtg_mount(rtg); struct xfs_zone_info *zi = mp->m_zone_info; - uint64_t used = rtg_rmap(rtg)->i_used_blocks; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; xfs_rgblock_t write_pointer, highest_rgbno; int error; @@ -1114,24 +1108,27 @@ xfs_get_zone_info_cb( } /* - * Calculate the max open zone limit based on the of number of - * backing zones available + * Calculate the max open zone limit based on the of number of backing zones + * available. */ static inline uint32_t xfs_max_open_zones( struct xfs_mount *mp) { unsigned int max_open, max_open_data_zones; + /* - * We need two zones for every open data zone, - * one in reserve as we don't reclaim open zones. One data zone - * and its spare is included in XFS_MIN_ZONES. + * We need two zones for every open data zone, one in reserve as we + * don't reclaim open zones. One data zone and its spare is included + * in XFS_MIN_ZONES to support at least one user data writer. */ max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; /* - * Cap the max open limit to 1/4 of available space + * Cap the max open limit to 1/4 of available space. Without this we'd + * run out of easy reclaim targets too quickly and storage devices don't + * handle huge numbers of concurrent write streams overly well. */ max_open = min(max_open, mp->m_sb.sb_rgcount / 4); diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h index ecf39106704c..4db02816d0fd 100644 --- a/fs/xfs/xfs_zone_alloc.h +++ b/fs/xfs/xfs_zone_alloc.h @@ -23,9 +23,9 @@ struct xfs_zone_alloc_ctx { */ #define XFS_ZR_RESERVED (1U << 2) -int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, +int xfs_zoned_space_reserve(struct xfs_mount *mp, xfs_filblks_t count_fsb, unsigned int flags, struct xfs_zone_alloc_ctx *ac); -void xfs_zoned_space_unreserve(struct xfs_inode *ip, +void xfs_zoned_space_unreserve(struct xfs_mount *mp, struct xfs_zone_alloc_ctx *ac); void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index d613a4094db6..064cd1a857a0 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -290,8 +290,6 @@ xfs_zone_gc_query_cb( return 0; } -#define cmp_int(l, r) ((l > r) - (l < r)) - static int xfs_zone_gc_rmap_rec_cmp( const void *a, @@ -330,10 +328,7 @@ xfs_zone_gc_query( iter->rec_idx = 0; iter->rec_count = 0; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); cur = xfs_rtrmapbt_init_cursor(tp, rtg); error = xfs_rmap_query_range(cur, &ri_low, &ri_high, @@ -535,8 +530,7 @@ xfs_zone_gc_steal_open( spin_lock(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { - if (!found || - oz->oz_write_pointer < found->oz_write_pointer) + if (!found || oz->oz_allocated < found->oz_allocated) found = oz; } @@ -586,7 +580,7 @@ xfs_zone_gc_ensure_target( { struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; - if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return xfs_zone_gc_select_target(mp); return oz; } @@ -607,7 +601,7 @@ xfs_zone_gc_space_available( oz = xfs_zone_gc_ensure_target(data->mp); if (!oz) return false; - return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + return oz->oz_allocated < rtg_blocks(oz->oz_rtg) && xfs_zone_gc_scratch_available(data); } @@ -649,7 +643,7 @@ xfs_zone_gc_alloc_blocks( */ spin_lock(&mp->m_sb_lock); *count_fsb = min(*count_fsb, - rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + rtg_blocks(oz->oz_rtg) - oz->oz_allocated); *count_fsb = min3(*count_fsb, mp->m_free[XC_FREE_RTEXTENTS].res_avail, mp->m_free[XC_FREE_RTAVAILABLE].res_avail); @@ -663,8 +657,8 @@ xfs_zone_gc_alloc_blocks( *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); if (!*is_seq) - *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); - oz->oz_write_pointer += *count_fsb; + *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); + oz->oz_allocated += *count_fsb; atomic_inc(&oz->oz_ref); return oz; } diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c index 733bcc2f8645..07e30c596975 100644 --- a/fs/xfs/xfs_zone_info.c +++ b/fs/xfs/xfs_zone_info.c @@ -32,7 +32,7 @@ xfs_show_open_zone( { seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", rtg_rgno(oz->oz_rtg), - oz->oz_write_pointer, oz->oz_written, + oz->oz_allocated, oz->oz_written, rtg_rmap(oz->oz_rtg)->i_used_blocks, xfs_write_hint_to_str(oz->oz_write_hint)); } diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index ab696975a993..35e6de3d25ed 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -11,18 +11,18 @@ struct xfs_open_zone { atomic_t oz_ref; /* - * oz_write_pointer is the write pointer at which space is handed out - * for conventional zones, or simple the count of blocks handed out - * so far for sequential write required zones and is protected by - * oz_alloc_lock/ + * oz_allocated is the amount of space already allocated out of the zone + * and is protected by oz_alloc_lock. + * + * For conventional zones it also is the offset of the next write. */ spinlock_t oz_alloc_lock; - xfs_rgblock_t oz_write_pointer; + xfs_rgblock_t oz_allocated; /* - * oz_written is the number of blocks for which we've received a - * write completion. oz_written must always be <= oz_write_pointer - * and is protected by the ILOCK of the rmap inode. + * oz_written is the number of blocks for which we've received a write + * completion. oz_written must always be <= oz_allocated and is + * protected by the ILOCK of the rmap inode. */ xfs_rgblock_t oz_written; diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index 93c9a7721139..1313c55b8cbe 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -117,11 +117,10 @@ xfs_zoned_space_wait_error( static int xfs_zoned_reserve_available( - struct xfs_inode *ip, + struct xfs_mount *mp, xfs_filblks_t count_fsb, unsigned int flags) { - struct xfs_mount *mp = ip->i_mount; struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_zone_reservation reservation = { .task = current, @@ -198,11 +197,10 @@ xfs_zoned_reserve_available( */ static int xfs_zoned_reserve_extents_greedy( - struct xfs_inode *ip, + struct xfs_mount *mp, xfs_filblks_t *count_fsb, unsigned int flags) { - struct xfs_mount *mp = ip->i_mount; struct xfs_zone_info *zi = mp->m_zone_info; s64 len = *count_fsb; int error = -ENOSPC; @@ -220,12 +218,11 @@ xfs_zoned_reserve_extents_greedy( int xfs_zoned_space_reserve( - struct xfs_inode *ip, + struct xfs_mount *mp, xfs_filblks_t count_fsb, unsigned int flags, struct xfs_zone_alloc_ctx *ac) { - struct xfs_mount *mp = ip->i_mount; int error; ASSERT(ac->reserved_blocks == 0); @@ -234,11 +231,11 @@ xfs_zoned_space_reserve( error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, flags & XFS_ZR_RESERVED); if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) - error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags); if (error) return error; - error = xfs_zoned_reserve_available(ip, count_fsb, flags); + error = xfs_zoned_reserve_available(mp, count_fsb, flags); if (error) { xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); return error; @@ -249,12 +246,10 @@ xfs_zoned_space_reserve( void xfs_zoned_space_unreserve( - struct xfs_inode *ip, + struct xfs_mount *mp, struct xfs_zone_alloc_ctx *ac) { if (ac->reserved_blocks > 0) { - struct xfs_mount *mp = ip->i_mount; - xfs_zoned_add_available(mp, ac->reserved_blocks); xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 42e2c0065bb3..fd3a5922f6c3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -124,37 +124,46 @@ static void zonefs_readahead(struct readahead_control *rac) * Map blocks for page writeback. This is used only on conventional zone files, * which implies that the page range can only be within the fixed inode size. */ -static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, - unsigned int len) +static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 offset, unsigned len, u64 end_pos) { - struct zonefs_zone *z = zonefs_inode_zone(inode); + struct zonefs_zone *z = zonefs_inode_zone(wpc->inode); if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) return -EIO; - if (WARN_ON_ONCE(offset >= i_size_read(inode))) + if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode))) return -EIO; /* If the mapping is already OK, nothing needs to be done */ - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) { + int error; + + error = zonefs_write_iomap_begin(wpc->inode, offset, + z->z_capacity - offset, IOMAP_WRITE, + &wpc->iomap, NULL); + if (error) + return error; + } - return zonefs_write_iomap_begin(inode, offset, - z->z_capacity - offset, - IOMAP_WRITE, &wpc->iomap, NULL); + return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_write_map_blocks, + .writeback_range = zonefs_writeback_range, + .writeback_submit = iomap_ioend_writeback_submit, }; static int zonefs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct iomap_writepage_ctx wpc = { }; + struct iomap_writepage_ctx wpc = { + .inode = mapping->host, + .wbc = wbc, + .ops = &zonefs_writeback_ops, + }; - return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); + return iomap_writepages(&wpc); } static int zonefs_swap_activate(struct swap_info_struct *sis, @@ -312,8 +321,10 @@ static const struct vm_operations_struct zonefs_file_vm_ops = { .page_mkwrite = zonefs_filemap_page_mkwrite, }; -static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; + /* * Conventional zones accept random writes, so their files can support * shared writable mappings. For sequential zone files, only read @@ -321,11 +332,11 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) * ordering between msync() and page cache writeback. */ if (zonefs_inode_is_seq(file_inode(file)) && - (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) return -EINVAL; file_accessed(file); - vma->vm_ops = &zonefs_file_vm_ops; + desc->vm_ops = &zonefs_file_vm_ops; return 0; } @@ -563,7 +574,8 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, if (ret <= 0) goto inode_unlock; - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL); + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, + NULL, NULL); if (ret == -EIO) zonefs_io_error(inode, true); @@ -850,7 +862,7 @@ const struct file_operations zonefs_file_operations = { .open = zonefs_file_open, .release = zonefs_file_release, .fsync = zonefs_file_fsync, - .mmap = zonefs_file_mmap, + .mmap_prepare = zonefs_file_mmap_prepare, .llseek = zonefs_file_llseek, .read_iter = zonefs_file_read_iter, .write_iter = zonefs_file_write_iter, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index d165eb979f21..4dc7f967c861 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1113,11 +1113,12 @@ static int zonefs_read_super(struct super_block *sb) u32 crc, stored_crc; int ret; - super = kmalloc(PAGE_SIZE, GFP_KERNEL); + super = kmalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL); if (!super) return -ENOMEM; - ret = bdev_rw_virt(sb->s_bdev, 0, super, PAGE_SIZE, REQ_OP_READ); + ret = bdev_rw_virt(sb->s_bdev, 0, super, ZONEFS_SUPER_SIZE, + REQ_OP_READ); if (ret) goto free_super; |