diff options
Diffstat (limited to 'fs')
627 files changed, 19260 insertions, 9551 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index cebba4eaa0b5..12c0ae29f185 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &path, sizeof(path), &version, sizeof(version), i_size_read(&v9inode->netfs.inode)); + if (v9inode->netfs.cache) + mapping_set_release_always(inode->i_mapping); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 950cf61f118b..0d28ecf668d0 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_mapping->a_ops = &v9fs_addr_operations; inode->i_private = NULL; @@ -1011,7 +1011,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache & CACHE_WRITEBACK) { if (S_ISREG(inode->i_mode)) { @@ -1032,7 +1032,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1152,7 +1152,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; - inode->i_ctime.tv_sec = stat->mtime; + inode_set_ctime(inode, stat->mtime, 0); inode->i_uid = v9ses->dfltuid; inode->i_gid = v9ses->dfltgid; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 14510872ecc3..1312f68965ac 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -450,7 +450,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache) { if (S_ISREG(inode->i_mode)) { @@ -475,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -645,8 +645,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_atime.tv_nsec = stat->st_atime_nsec; inode->i_mtime.tv_sec = stat->st_mtime_sec; inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode_set_ctime(inode, stat->st_ctime_sec, + stat->st_ctime_nsec); inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; set_nlink(inode, stat->st_nlink); @@ -668,8 +668,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_mtime.tv_nsec = stat->st_mtime_nsec; } if (stat->st_result_mask & P9_STATS_CTIME) { - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode_set_ctime(inode, stat->st_ctime_sec, + stat->st_ctime_nsec); } if (stat->st_result_mask & P9_STATS_UID) inode->i_uid = stat->st_uid; diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..aa7e03cc1941 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +config BUFFER_HEAD + bool + # old blockdev_direct_IO implementation. Use iomap for new code instead config LEGACY_DIRECT_IO + depends on BUFFER_HEAD bool if BLOCK @@ -169,6 +173,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM + select MEMFD_CREATE help Tmpfs is a file system which keeps all files in virtual memory. @@ -205,8 +210,8 @@ config TMPFS_XATTR Extended attributes are name:value pairs associated with inodes by the kernel or by users (see the attr(5) manual page for details). - Currently this enables support for the trusted.* and - security.* namespaces. + This enables support for the trusted.*, security.* and user.* + namespaces. You need this for POSIX ACL support on tmpfs. @@ -233,6 +238,18 @@ config TMPFS_INODE64 If unsure, say N. +config TMPFS_QUOTA + bool "Tmpfs quota support" + depends on TMPFS + select QUOTA + help + Quota support allows to set per user and group limits for tmpfs + usage. Say Y to enable quota support. Once enabled you can control + user and group quota enforcement with quota, usrquota and grpquota + mount options. + + If unsure, say N. + config ARCH_SUPPORTS_HUGETLBFS def_bool n @@ -240,6 +257,7 @@ config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) + select MEMFD_CREATE help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -252,7 +270,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON @@ -264,9 +282,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off (boot command line) or hugetlb_optimize_vmemmap (sysctl). -config MEMFD_CREATE - def_bool TMPFS || HUGETLBFS - config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 93539aac0e5b..f5693164ca9a 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU) + depends on ARM || ((M68K || RISCV || SUPERH || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/Makefile b/fs/Makefile index e513aaee0603..f9541f40be4e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 44738fed6625..1b97058f0c4a 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -2,6 +2,7 @@ config ADFS_FS tristate "ADFS file system support" depends on BLOCK + select BUFFER_HEAD help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h index a5393e6cf9f4..4e6c53d59ebd 100644 --- a/fs/adfs/dir_f.h +++ b/fs/adfs/dir_f.h @@ -58,9 +58,4 @@ struct adfs_newdirtail { __u8 dircheckbyte; } __attribute__((packed)); -union adfs_dirtail { - struct adfs_olddirtail old; - struct adfs_newdirtail new; -}; - #endif diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index c3ac613d0975..20963002578a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -270,7 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_mode = adfs_atts2mode(sb, inode); adfs_adfs2unix_time(&inode->i_mtime, inode); inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; + inode_set_ctime_to_ts(inode, inode->i_mtime); if (S_ISDIR(inode->i_mode)) { inode->i_op = &adfs_dir_inode_operations; @@ -331,7 +331,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode); inode->i_mode = adfs_atts2mode(sb, inode); diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 962b86374e1c..1ae432d266c3 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select BUFFER_HEAD select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 29f11e10a7c7..7ba93efc1143 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) mark_buffer_dirty_inode(dir_bh, dir); affs_brelse(dir_bh); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) affs_brelse(bh); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -315,7 +315,7 @@ affs_remove_header(struct dentry *dentry) else clear_nlink(inode); affs_unlock_link(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); done: diff --git a/fs/affs/file.c b/fs/affs/file.c index e43f2f007ac1..04c018e19602 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -15,6 +15,7 @@ #include <linux/uio.h> #include <linux/blkdev.h> +#include <linux/mpage.h> #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); @@ -370,9 +371,10 @@ err_alloc: return -ENOSPC; } -static int affs_writepage(struct page *page, struct writeback_control *wbc) +static int affs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, affs_get_block, wbc); + return mpage_writepages(mapping, wbc, affs_get_block); } static int affs_read_folio(struct file *file, struct folio *folio) @@ -456,10 +458,11 @@ const struct address_space_operations affs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = affs_read_folio, - .writepage = affs_writepage, + .writepages = affs_writepages, .write_begin = affs_write_begin, .write_end = affs_write_end, .direct_IO = affs_direct_IO, + .migrate_folio = buffer_migrate_folio, .bmap = _affs_bmap }; @@ -520,21 +523,20 @@ affs_getemptyblk_ino(struct inode *inode, int block) return ERR_PTR(err); } -static int -affs_do_readpage_ofs(struct page *page, unsigned to, int create) +static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; - unsigned pos = 0; - u32 bidx, boff, bsize; + size_t pos = 0; + size_t bidx, boff, bsize; u32 tmp; - pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, - page->index, to); - BUG_ON(to > PAGE_SIZE); + pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino, + folio->index, to); + BUG_ON(to > folio_size(folio)); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = page->index << PAGE_SHIFT; + tmp = folio_pos(folio); bidx = tmp / bsize; boff = tmp % bsize; @@ -544,7 +546,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); - memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp); + memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; pos += tmp; @@ -624,25 +626,23 @@ out: return PTR_ERR(bh); } -static int -affs_read_folio_ofs(struct file *file, struct folio *folio) +static int affs_read_folio_ofs(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; - u32 to; + struct inode *inode = folio->mapping->host; + size_t to; int err; - pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); - to = PAGE_SIZE; - if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) { - to = inode->i_size & ~PAGE_MASK; - memset(page_address(page) + to, 0, PAGE_SIZE - to); + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index); + to = folio_size(folio); + if (folio_pos(folio) + to > inode->i_size) { + to = inode->i_size - folio_pos(folio); + folio_zero_segment(folio, to, folio_size(folio)); } - err = affs_do_readpage_ofs(page, to, 0); + err = affs_do_read_folio_ofs(folio, to, 0); if (!err) - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return err; } @@ -651,7 +651,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - struct page *page; + struct folio *folio; pgoff_t index; int err = 0; @@ -667,19 +667,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); + err = affs_do_read_folio_ofs(folio, folio_size(folio), 1); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return err; } @@ -688,6 +689,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -701,18 +703,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but - * we don't have to, because the page should always be uptodate here, + * we don't have to, because the folio should always be uptodate here, * due to write_begin. */ pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); bsize = AFFS_SB(sb)->s_data_blksize; - data = page_address(page); + data = folio_address(folio); bh = NULL; written = 0; - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; bidx = tmp / bsize; boff = tmp % bsize; if (boff) { @@ -804,11 +806,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, from += tmp; bidx++; } - SetPageUptodate(page); + folio_mark_uptodate(folio); done: affs_brelse(bh); - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; @@ -819,8 +821,8 @@ done: } err_first_bh: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return written; @@ -835,9 +837,10 @@ const struct address_space_operations affs_aops_ofs = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = affs_read_folio_ofs, - //.writepage = affs_writepage_ofs, + //.writepages = affs_writepages_ofs, .write_begin = affs_write_begin_ofs, - .write_end = affs_write_end_ofs + .write_end = affs_write_end_ofs, + .migrate_folio = filemap_migrate_folio, }; /* Free any preallocated blocks. */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 27f77a52c5c8..060746c63151 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -149,13 +149,13 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; } - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec - = (be32_to_cpu(tail->change.days) * 86400LL + - be32_to_cpu(tail->change.mins) * 60 + - be32_to_cpu(tail->change.ticks) / 50 + - AFFS_EPOCH_DELTA) + - sys_tz.tz_minuteswest * 60; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = + inode_set_ctime(inode, + (be32_to_cpu(tail->change.days) * 86400LL + + be32_to_cpu(tail->change.mins) * 60 + + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + + sys_tz.tz_minuteswest * 60, 0).tv_sec; + inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0; affs_brelse(bh); unlock_new_inode(inode); return inode; @@ -314,7 +314,7 @@ affs_new_inode(struct inode *dir) inode->i_gid = current_fsgid(); inode->i_ino = block; set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d12ccfd2a83d..2fe4a5832fcf 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -43,7 +43,7 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate) +__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t fn, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; @@ -57,7 +57,7 @@ __affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t tou hash = init_name_hash(dentry); len = min(qstr->len, AFFSNAMEMAX); for (; len > 0; name++, len--) - hash = partial_name_hash(toupper(*name), hash); + hash = partial_name_hash(fn(*name), hash); qstr->hash = end_name_hash(hash); return 0; @@ -80,7 +80,7 @@ affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper, + const char *str, const struct qstr *name, toupper_t fn, bool notruncate) { const u8 *aname = str; @@ -106,7 +106,7 @@ static inline int __affs_compare_dentry(unsigned int len, return 1; for (; len > 0; len--) - if (toupper(*aname++) != toupper(*bname++)) + if (fn(*aname++) != fn(*bname++)) return 1; return 0; @@ -135,7 +135,7 @@ affs_intl_compare_dentry(const struct dentry *dentry, */ static inline int -affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) +affs_match(struct dentry *dentry, const u8 *name2, toupper_t fn) { const u8 *name = dentry->d_name.name; int len = dentry->d_name.len; @@ -148,7 +148,7 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) return 0; for (name2++; len > 0; len--) - if (toupper(*name++) != toupper(*name2++)) + if (fn(*name++) != fn(*name2++)) return 0; return 1; } @@ -156,12 +156,12 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) { - toupper_t toupper = affs_get_toupper(sb); + toupper_t fn = affs_get_toupper(sb); u32 hash; hash = len = min(len, AFFSNAMEMAX); for (; len > 0; len--) - hash = (hash * 13 + toupper(*name++)) & 0x7ff; + hash = (hash * 13 + fn(*name++)) & 0x7ff; return hash % AFFS_SB(sb)->s_hashsize; } @@ -171,7 +171,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; - toupper_t toupper = affs_get_toupper(sb); + toupper_t fn = affs_get_toupper(sb); u32 key; pr_debug("%s(\"%pd\")\n", __func__, dentry); @@ -189,7 +189,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) bh = affs_bread(sb, key); if (!bh) return ERR_PTR(-EIO); - if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper)) + if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, fn)) return bh; key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); } diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 31d6446dc166..094aec8d17b8 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -13,10 +13,9 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct buffer_head *bh; - struct inode *inode = page->mapping->host; - char *link = page_address(page); + struct inode *inode = folio->mapping->host; + char *link = folio_address(folio); struct slink_front *lf; int i, j; char c; @@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) } link[i] = '\0'; affs_brelse(bh); - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_unlock(folio); return -EIO; } diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index d7d9402ff718..95bcbd7654d1 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_blocks = 0; inode->i_generation = 0; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 866bab860a88..1c794a1896aa 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -90,7 +90,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, vnode->status = *status; t = status->mtime_client; - inode->i_ctime = t; + inode_set_ctime_to_ts(inode, t); inode->i_mtime = t; inode->i_atime = t; inode->i_flags |= S_NOATIME; @@ -206,7 +206,7 @@ static void afs_apply_status(struct afs_operation *op, t = status->mtime_client; inode->i_mtime = t; if (vp->update_ctime) - inode->i_ctime = op->ctime; + inode_set_ctime_to_ts(inode, op->ctime); if (vnode->status.data_version != status->data_version) data_changed = true; @@ -252,7 +252,7 @@ static void afs_apply_status(struct afs_operation *op, vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); - inode->i_ctime = t; + inode_set_ctime_to_ts(inode, t); inode->i_atime = t; } } @@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9d3d64921106..da73b97e19a9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } @@ -80,7 +80,7 @@ struct aio_ring { struct kioctx_table { struct rcu_head rcu; unsigned nr; - struct kioctx __rcu *table[]; + struct kioctx __rcu *table[] __counted_by(nr); }; struct kioctx_cpu { @@ -558,7 +558,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused, NULL); + MAP_SHARED, 0, 0, &unused, NULL); mmap_write_unlock(mm); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; @@ -1447,13 +1447,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + kiocb_end_write(kiocb); } iocb->ki_res.res = res; @@ -1581,17 +1576,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * aio_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (S_ISREG(file_inode(file)->i_mode)) { - sb_start_write(file_inode(file)->i_sb); - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); - } + if (S_ISREG(file_inode(file)->i_mode)) + kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, call_write_iter(file, req, &iter)); } diff --git a/fs/attr.c b/fs/attr.c index d60dc1edb526..a8ae5f6d9b16 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -312,7 +312,7 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, if (ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -394,9 +394,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, return error; if ((ia_valid & ATTR_MODE)) { - umode_t amode = attr->ia_mode; + /* + * Don't allow changing the mode of symlinks: + * + * (1) The vfs doesn't take the mode of symlinks into account + * during permission checking. + * (2) This has never worked correctly. Most major filesystems + * did return EOPNOTSUPP due to interactions with POSIX ACLs + * but did still updated the mode of the symlink. + * This inconsistency led system call wrapper providers such + * as libc to block changing the mode of symlinks with + * EOPNOTSUPP already. + * (3) To even do this in the first place one would have to use + * specific file descriptors and quite some effort. + */ + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + /* Flag setting protected by i_mutex */ - if (is_sxid(amode)) + if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index affa70360b1f..2b49662ed237 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -370,7 +370,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 93046c9dc461..512b9a26c63d 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); return 0; } diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 54c1f8b8b075..33dd4660d82f 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -32,8 +32,9 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; - wq->wait_ctr--; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index db649487d58c..83f9566c973b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -133,8 +133,7 @@ static int bad_inode_fiemap(struct inode *inode, return -EIO; } -static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, - int flags) +static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } @@ -209,8 +208,7 @@ void make_bad_inode(struct inode *inode) remove_inode_hash(inode); inode->i_mode = S_IFREG; - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 9550b6462b81..5fcfc4024ffe 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -2,6 +2,7 @@ config BEFS_FS tristate "BeOS file system (BeFS) support (read only)" depends on BLOCK + select BUFFER_HEAD select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index eee9237386e2..9a16a51fbb88 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -363,7 +363,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_mtime.tv_sec = fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ - inode->i_ctime = inode->i_mtime; + inode_set_ctime_to_ts(inode, inode->i_mtime); inode->i_atime = inode->i_mtime; befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3a757805b585..8e7ef866b62a 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -2,6 +2,7 @@ config BFS_FS tristate "BFS file system support" depends on BLOCK + select BUFFER_HEAD help Boot File System (BFS) is a file system used under SCO UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 040d5140e426..12b8af04dcb3 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; @@ -158,7 +158,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, return err; } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); @@ -187,9 +187,9 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) } de->ino = 0; mark_buffer_dirty_inode(bh, dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); error = 0; @@ -240,10 +240,10 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto end_rename; } old_de->ino = 0; - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); mark_inode_dirty(old_dir); if (new_inode) { - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); @@ -292,9 +292,9 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; - dir->i_ctime = current_time(dir); + inode_set_ctime_current(dir); } - dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1926bec2c850..e6a76ae9eb44 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -82,10 +82,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); - inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); + inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; brelse(bh); unlock_new_inode(inode); @@ -143,7 +142,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1c6c5832af86..43b2a2851ba3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -138,7 +138,7 @@ static int is_constdisp(struct elfhdr *hdr) static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) { - struct elf32_phdr *phdr; + struct elf_phdr *phdr; unsigned long size; int retval, loop; loff_t pos = params->hdr.e_phoff; @@ -560,8 +560,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp &= ~7UL; /* stack the load map(s) */ - len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; + len = sizeof(struct elf_fdpic_loadmap); + len += sizeof(struct elf_fdpic_loadseg) * exec_params->loadmap->nsegs; sp = (sp - len) & ~7UL; exec_params->map_addr = sp; @@ -571,8 +571,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; if (interp_params->loadmap) { - len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * + len = sizeof(struct elf_fdpic_loadmap); + len += sizeof(struct elf_fdpic_loadseg) * interp_params->loadmap->nsegs; sp = (sp - len) & ~7UL; interp_params->map_addr = sp; @@ -740,13 +740,13 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, struct mm_struct *mm, const char *what) { - struct elf32_fdpic_loadmap *loadmap; + struct elf_fdpic_loadmap *loadmap; #ifdef CONFIG_MMU - struct elf32_fdpic_loadseg *mseg; + struct elf_fdpic_loadseg *mseg; unsigned long load_addr; #endif - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned nloads, tmp; unsigned long stop; int loop, ret; @@ -766,7 +766,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, params->loadmap = loadmap; - loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; + loadmap->version = ELF_FDPIC_LOADMAP_VERSION; loadmap->nsegs = nloads; /* map the requested LOADs into the memory space */ @@ -839,8 +839,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (phdr->p_vaddr >= seg->p_vaddr && phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz) { - Elf32_Dyn __user *dyn; - Elf32_Sword d_tag; + Elf_Dyn __user *dyn; + Elf_Sword d_tag; params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + @@ -850,11 +850,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, * one item, and that the last item is a NULL * entry */ if (phdr->p_memsz == 0 || - phdr->p_memsz % sizeof(Elf32_Dyn) != 0) + phdr->p_memsz % sizeof(Elf_Dyn) != 0) goto dynamic_error; - tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - dyn = (Elf32_Dyn __user *)params->dynamic_addr; + tmp = phdr->p_memsz / sizeof(Elf_Dyn); + dyn = (Elf_Dyn __user *)params->dynamic_addr; if (get_user(d_tag, &dyn[tmp - 1].d_tag) || d_tag != 0) goto dynamic_error; @@ -923,8 +923,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( struct file *file, struct mm_struct *mm) { - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; @@ -1007,8 +1007,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct file *file, struct mm_struct *mm) { - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned long load_addr, delta_vaddr; int loop, dvset; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bb202ad369d5..e0108d17b085 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -547,8 +547,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index dfe29b29915f..caf0bbd028d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1844,9 +1844,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime.tv_sec); @@ -1897,8 +1897,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); + inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + btrfs_stack_timespec_nsec(&inode_item->ctime)); BTRFS_I(inode)->i_otime.tv_sec = btrfs_stack_timespec_sec(&inode_item->otime); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e8726a83b649..361535c71c0f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -876,9 +876,9 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } -static unsigned int get_prepare_fgp_flags(bool nowait) +static fgf_t get_prepare_fgp_flags(bool nowait) { - unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; if (nowait) fgp_flags |= FGP_NOWAIT; @@ -910,7 +910,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - unsigned int fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; @@ -1108,7 +1108,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) static void update_time_for_write(struct inode *inode) { - struct timespec64 now; + struct timespec64 now, ctime; if (IS_NOCMTIME(inode)) return; @@ -1117,8 +1117,9 @@ static void update_time_for_write(struct inode *inode) if (!timespec64_equal(&inode->i_mtime, &now)) inode->i_mtime = now; - if (!timespec64_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + ctime = inode_get_ctime(inode); + if (!timespec64_equal(&ctime, &now)) + inode_set_ctime_to_ts(inode, now); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); @@ -2471,10 +2472,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, */ inode_inc_iversion(&inode->vfs_inode); - if (!extent_info || extent_info->update_times) { - inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); - inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; - } + if (!extent_info || extent_info->update_times) + inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode); ret = btrfs_update_inode(trans, root, inode); if (ret) @@ -2715,8 +2714,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); @@ -2733,11 +2731,10 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ - struct timespec64 now = current_time(inode); + struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_mtime = now; - inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -2808,7 +2805,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e1b15e69ed6..7814b9d654ce 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3760,8 +3760,8 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + btrfs_timespec_nsec(leaf, &inode_item->ctime)); BTRFS_I(inode)->i_otime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->otime); @@ -3932,9 +3932,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime.tv_sec); @@ -4132,9 +4132,8 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; - dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; + inode_set_ctime_current(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4307,8 +4306,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); - dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); @@ -4958,8 +4956,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); } } @@ -5603,9 +5600,8 @@ static struct inode *new_simple_dir(struct inode *dir, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode->i_atime = dir->i_atime; - inode->i_ctime = dir->i_ctime; BTRFS_I(inode)->i_otime = inode->i_mtime; inode->i_uid = dir->i_uid; inode->i_gid = dir->i_gid; @@ -6025,8 +6021,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ -static int btrfs_update_time(struct inode *inode, struct timespec64 *now, - int flags) +static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty = flags & ~S_VERSION; @@ -6034,14 +6029,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, if (btrfs_root_readonly(root)) return -EROFS; - if (flags & S_VERSION) - dirty |= inode_maybe_inc_iversion(inode, dirty); - if (flags & S_CTIME) - inode->i_ctime = *now; - if (flags & S_MTIME) - inode->i_mtime = *now; - if (flags & S_ATIME) - inode->i_atime = *now; + dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } @@ -6289,9 +6277,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; /* @@ -6456,12 +6443,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { - struct timespec64 now = current_time(&parent_inode->vfs_inode); + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + parent_inode->vfs_inode.i_mtime = + inode_set_ctime_current(&parent_inode->vfs_inode); - parent_inode->vfs_inode.i_mtime = now; - parent_inode->vfs_inode.i_ctime = now; - } ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); @@ -6601,7 +6586,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -8667,7 +8652,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -8691,7 +8676,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec64 ctime = current_time(old_inode); struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); @@ -8822,12 +8806,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_mtime = ctime; - old_dir->i_ctime = ctime; - new_dir->i_mtime = ctime; - new_dir->i_ctime = ctime; - old_inode->i_ctime = ctime; - new_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9091,11 +9070,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_mtime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime; - new_dir->i_mtime = old_dir->i_mtime; - new_dir->i_ctime = old_dir->i_mtime; - old_inode->i_ctime = old_dir->i_mtime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9117,7 +9092,6 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); @@ -9652,7 +9626,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d27b0d86b8e2..75ab766fe156 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -384,7 +384,7 @@ update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out_end_trans: diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 005751a12911..40f2d9f1a17a 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -8,8 +8,6 @@ #include <linux/math64.h> #include <linux/rbtree.h> -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. */ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 0474bbe39da7..65d2bd6910f2 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -30,8 +30,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); if (!no_time_update) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); } /* * We round up to the block size at eof when determining which diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3b60e56e5e02..c780d3729463 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1860,8 +1860,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); - parent_inode->i_mtime = current_time(parent_inode); - parent_inode->i_ctime = parent_inode->i_mtime; + parent_inode->i_mtime = inode_set_ctime_current(parent_inode); ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 537eb3de8809..cbb17b542131 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4148,9 +4148,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e3a3769fd92e..5a5a8d488a7b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1911,15 +1911,13 @@ out: static void update_dev_time(const char *device_path) { struct path path; - struct timespec64 now; int ret; ret = kern_path(device_path, LOOKUP_FOLLOW, &path); if (ret) return; - now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); + inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); path_put(&path); } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fc4b20c2688a..96828a13dd43 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -264,7 +264,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, goto out; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); @@ -407,7 +407,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, ret = btrfs_set_prop(trans, inode, name, value, size, flags); if (!ret) { inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c..a6785cd07081 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,6 +49,7 @@ #include <trace/events/block.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> +#include <linux/sched/isolation.h> #include "internal.h" @@ -562,12 +563,6 @@ repeat: return err; } -void emergency_thaw_bdev(struct super_block *sb) -{ - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) - printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written @@ -1225,19 +1220,14 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { - struct super_block *sb; - set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); - if (bh->b_assoc_map) + if (bh->b_assoc_map) { mapping_set_error(bh->b_assoc_map, -EIO); - rcu_read_lock(); - sb = READ_ONCE(bh->b_bdev->bd_super); - if (sb) - errseq_set(&sb->s_wb_err, -EIO); - rcu_read_unlock(); + errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO); + } } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1352,7 +1342,7 @@ static void bh_lru_install(struct buffer_head *bh) * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) { + if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return; } @@ -1382,6 +1372,10 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) check_irqs_on(); bh_lru_lock(); + if (cpu_is_isolated(smp_processor_id())) { + bh_lru_unlock(); + return NULL; + } for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); @@ -1539,21 +1533,6 @@ void invalidate_bh_lrus_cpu(void) bh_lru_unlock(); } -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset) -{ - bh->b_page = page; - BUG_ON(offset >= PAGE_SIZE); - if (PageHighMem(page)) - /* - * This catches illegal uses and preserves the offset: - */ - bh->b_data = (char *)(0 + offset); - else - bh->b_data = page_address(page) + offset; -} -EXPORT_SYMBOL(set_bh_page); - void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset) { @@ -2032,7 +2011,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) } EXPORT_SYMBOL(folio_zero_new_buffers); -static void +static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { @@ -2046,7 +2025,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, * current block, then do not map the buffer and let the caller * handle it. */ - BUG_ON(offset >= iomap->offset + iomap->length); + if (offset >= iomap->offset + iomap->length) + return -EIO; switch (iomap->type) { case IOMAP_HOLE: @@ -2058,7 +2038,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); - break; + return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) @@ -2066,7 +2046,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, set_buffer_uptodate(bh); set_buffer_mapped(bh); set_buffer_delay(bh); - break; + return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions @@ -2083,7 +2063,10 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); - break; + return 0; + default: + WARN_ON_ONCE(1); + return -EIO; } } @@ -2124,13 +2107,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - if (get_block) { + if (get_block) err = get_block(inode, block, bh, 1); - if (err) - break; - } else { - iomap_to_bh(inode, block, bh, iomap); - } + else + err = iomap_to_bh(inode, block, bh, iomap); + if (err) + break; if (buffer_new(bh)) { clean_bdev_bh_alias(bh); @@ -2180,8 +2162,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct inode *inode, struct folio *folio, - size_t from, size_t to) +static void __block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2216,7 +2197,6 @@ static int __block_commit_write(struct inode *inode, struct folio *folio, */ if (!partial) folio_mark_uptodate(folio); - return 0; } /* @@ -2253,7 +2233,6 @@ int block_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); - struct inode *inode = mapping->host; size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2277,7 +2256,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, folio, start, start + copied); + __block_commit_write(folio, start, start + copied); return copied; } @@ -2598,12 +2577,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -int block_commit_write(struct page *page, unsigned from, unsigned to) +void block_commit_write(struct page *page, unsigned from, unsigned to) { struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - __block_commit_write(inode, folio, from, to); - return 0; + __block_commit_write(folio, from, to); } EXPORT_SYMBOL(block_commit_write); @@ -2649,11 +2626,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, end = size - folio_pos(folio); ret = __block_write_begin_int(folio, 0, end, get_block, NULL); - if (!ret) - ret = __block_commit_write(inode, folio, 0, end); - - if (unlikely(ret < 0)) + if (unlikely(ret)) goto out_unlock; + + __block_commit_write(folio, 0, end); + folio_mark_dirty(folio); folio_wait_stable(folio); return 0; diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 175a25fcade8..009d23cd435b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - /* Tell lockdep we inherited freeze protection from submission thread */ - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object, { struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; - struct inode *inode; unsigned int old_nofs; ssize_t ret; size_t len = iov_iter_count(iter); @@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - /* Open-code file_start_write here to grab freeze protection, which - * will be released by another thread in aio_complete_rw(). Fool - * lockdep by telling it the lock got released so that it doesn't - * complain about the held lock when we return to userspace. - */ - inode = file_inode(file); - __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); - __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + kiocb_start_write(&ki->iocb); get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); - trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); + trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); ret = cachefiles_inject_write_error(); if (ret == 0) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d9d22d0ec38a..7bf7a5fcc045 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, if (ret < 0) goto check_failed; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); + object->file = file; /* Always update the atime on an object we've just looked up (this is diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 50c635dc7f71..1f77ca04c426 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o +ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 6945a938d396..c53a1d220622 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -93,7 +93,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, char *value = NULL; struct iattr newattrs; struct inode *inode = d_inode(dentry); - struct timespec64 old_ctime = inode->i_ctime; + struct timespec64 old_ctime = inode_get_ctime(inode); umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -140,7 +140,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - ret = __ceph_setattr(inode, &newattrs); + ret = __ceph_setattr(inode, &newattrs, NULL); if (ret) goto out_free; } @@ -151,7 +151,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = old_ctime; newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - __ceph_setattr(inode, &newattrs); + __ceph_setattr(inode, &newattrs, NULL); } goto out_free; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 59cbfb80edbd..f4863078f7fe 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -18,6 +18,7 @@ #include "mds_client.h" #include "cache.h" #include "metric.h" +#include "crypto.h" #include <linux/ceph/osd_client.h> #include <linux/ceph/striper.h> @@ -242,11 +243,13 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) static void finish_netfs_read(struct ceph_osd_request *req) { - struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct netfs_io_subrequest *subreq = req->r_priv; - int num_pages; + struct ceph_osd_req_op *op = &req->r_ops[0]; int err = req->r_result; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, osd_data->length, err); @@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req) else if (err == -EBLOCKLISTED) fsc->blocklisted = true; - if (err >= 0 && err < subreq->len) - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (err >= 0) { + if (sparse && err > 0) + err = ceph_sparse_ext_map_end(op); + if (err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (IS_ENCRYPTED(inode) && err > 0) { + err = ceph_fscrypt_decrypt_extents(inode, + osd_data->pages, subreq->start, + op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (err > subreq->len) + err = subreq->len; + } + } + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + ceph_put_page_vector(osd_data->pages, + calc_pages_for(osd_data->alignment, + osd_data->length), false); + } netfs_subreq_terminated(subreq, err, false); - - num_pages = calc_pages_for(osd_data->alignment, osd_data->length); - ceph_put_page_vector(osd_data->pages, num_pages, false); iput(req->r_inode); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) @@ -334,10 +352,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; - struct page **pages; - size_t page_off; int err = 0; u64 len = subreq->len; + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 off = subreq->start; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -347,8 +365,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, - 0, 1, CEPH_OSD_OP_READ, + ceph_fscrypt_adjust_off_and_len(inode, &off, &len); + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, + off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -357,20 +377,48 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) goto out; } + if (sparse) { + err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + if (err) + goto out; + } + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); - if (err < 0) { - dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); - goto out; - } - /* should always give us a page-aligned read */ - WARN_ON_ONCE(page_off); - len = err; - err = 0; + /* + * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for + * encrypted inodes. We'd need infrastructure that handles an iov_iter + * instead of page arrays, and we don't have that as of yet. Once the + * dust settles on the write helpers and encrypt/decrypt routines for + * netfs, we should be able to rework this. + */ + if (IS_ENCRYPTED(inode)) { + struct page **pages; + size_t page_off; + + err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + if (err < 0) { + dout("%s: iov_ter_get_pages_alloc returned %d\n", + __func__, err); + goto out; + } + + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + err = 0; - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, + false); + } else { + osd_req_op_extent_osd_iter(req, 0, &iter); + } + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + err = -EIO; + goto out; + } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; @@ -571,10 +619,12 @@ static u64 get_writepages_data_length(struct inode *inode, struct page *page, u64 start) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; struct ceph_cap_snap *capsnap = NULL; u64 end = i_size_read(inode); + u64 ret; + snapc = page_snap_context(ceph_fscrypt_pagecache_page(page)); if (snapc != ci->i_head_snapc) { bool found = false; spin_lock(&ci->i_ceph_lock); @@ -589,9 +639,12 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + thp_size(page)) - end = page_offset(page) + thp_size(page); - return end > start ? end - start : 0; + if (end > ceph_fscrypt_page_offset(page) + thp_size(page)) + end = ceph_fscrypt_page_offset(page) + thp_size(page); + ret = end > start ? end - start : 0; + if (ret && fscrypt_is_bounce_page(page)) + ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE); + return ret; } /* @@ -610,10 +663,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) loff_t page_off = page_offset(page); int err; loff_t len = thp_size(page); + loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; bool caching = ceph_is_cache_enabled(inode); + struct page *bounce_page = NULL; dout("writepage %p idx %lu\n", page, page->index); @@ -649,31 +704,51 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; + wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", - inode, page, page->index, page_off, len, snapc, snapc->seq); + inode, page, page->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = true; - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, - ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - true); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); return PTR_ERR(req); } + if (wlen < len) + len = wlen; + set_page_writeback(page); if (caching) ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); + if (IS_ENCRYPTED(inode)) { + bounce_page = fscrypt_encrypt_pagecache_blocks(page, + CEPH_FSCRYPT_BLOCK_SIZE, 0, + GFP_NOFS); + if (IS_ERR(bounce_page)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + ceph_osdc_put_request(req); + return PTR_ERR(bounce_page); + } + } + /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); - dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); + osd_req_op_extent_osd_data_pages(req, 0, + bounce_page ? &bounce_page : &page, wlen, 0, + false, false); + dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n", + page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not "); req->r_mtime = inode->i_mtime; ceph_osdc_start_request(osdc, req); @@ -681,7 +756,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); - + fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) err = len; @@ -800,6 +875,11 @@ static void writepages_finish(struct ceph_osd_request *req) total_pages += num_pages; for (j = 0; j < num_pages; j++) { page = osd_data->pages[j]; + if (fscrypt_is_bounce_page(page)) { + page = fscrypt_pagecache_page(page); + fscrypt_free_bounce_page(osd_data->pages[j]); + osd_data->pages[j] = page; + } BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -835,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req) else kfree(osd_data->pages); ceph_osdc_put_request(req); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* @@ -1070,9 +1151,28 @@ get_more_pages: fsc->mount_options->congestion_kb)) fsc->write_congested = true; - pages[locked_pages++] = page; - fbatch.folios[i] = NULL; + if (IS_ENCRYPTED(inode)) { + pages[locked_pages] = + fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, + locked_pages ? GFP_NOWAIT : GFP_NOFS); + if (IS_ERR(pages[locked_pages])) { + if (PTR_ERR(pages[locked_pages]) == -EINVAL) + pr_err("%s: inode->i_blkbits=%hhu\n", + __func__, inode->i_blkbits); + /* better not fail on first page! */ + BUG_ON(locked_pages == 0); + pages[locked_pages] = NULL; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + break; + } + ++locked_pages; + } else { + pages[locked_pages++] = page; + } + fbatch.folios[i] = NULL; len += thp_size(page); } @@ -1100,7 +1200,7 @@ get_more_pages: } new_request: - offset = page_offset(pages[0]); + offset = ceph_fscrypt_page_offset(pages[0]); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, @@ -1121,9 +1221,13 @@ new_request: ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } - BUG_ON(len < page_offset(pages[locked_pages - 1]) + - thp_size(page) - offset); + BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + + thp_size(pages[locked_pages - 1]) - offset); + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto release_folios; + } req->r_callback = writepages_finish; req->r_inode = inode; @@ -1132,7 +1236,9 @@ new_request: data_pages = pages; op_idx = 0; for (i = 0; i < locked_pages; i++) { - u64 cur_offset = page_offset(pages[i]); + struct page *page = ceph_fscrypt_pagecache_page(pages[i]); + + u64 cur_offset = page_offset(page); /* * Discontinuity in page range? Ceph can handle that by just passing * multiple extents in the write op. @@ -1161,9 +1267,9 @@ new_request: op_idx++; } - set_page_writeback(pages[i]); + set_page_writeback(page); if (caching) - ceph_set_page_fscache(pages[i]); + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); @@ -1179,8 +1285,16 @@ new_request: offset); len = max(len, min_len); } + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + dout("writepages got pages at %llu~%llu\n", offset, len); + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) + pr_warn("%s: bad encrypted write offset=%lld len=%llu\n", + __func__, offset, len); + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 177d8e8d73fe..de1dee46d3df 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e2bb0d0072da..14215ec646f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -14,6 +14,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -1216,15 +1217,11 @@ struct cap_msg_args { umode_t mode; bool inline_data; bool wake; + bool encrypted; + u32 fscrypt_auth_len; + u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; -/* - * cap struct size + flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid - */ -#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ - 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) - /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { @@ -1240,7 +1237,7 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); - msg->hdr.version = cpu_to_le16(10); + msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; @@ -1257,7 +1254,13 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); - fc->size = cpu_to_le64(arg->size); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (arg->encrypted) + fc->size = cpu_to_le64(round_up(arg->size, + CEPH_FSCRYPT_BLOCK_SIZE)); + else +#endif + fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); @@ -1311,6 +1314,27 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); + + /* dirstats (version 11) - these are r/o on the client */ + ceph_encode_64(&p, 0); + ceph_encode_64(&p, 0); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + /* + * fscrypt_auth and fscrypt_file (version 12) + * + * fscrypt_auth holds the crypto context (if any). fscrypt_file + * tracks the real i_size as an __le64 field (and we use a rounded-up + * i_size in the traditional size field). + */ + ceph_encode_32(&p, arg->fscrypt_auth_len); + ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); + ceph_encode_32(&p, sizeof(__le64)); + ceph_encode_64(&p, arg->size); +#else /* CONFIG_FS_ENCRYPTION */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); +#endif /* CONFIG_FS_ENCRYPTION */ } /* @@ -1378,7 +1402,6 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; @@ -1400,7 +1423,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->mtime = inode->i_mtime; arg->atime = inode->i_atime; - arg->ctime = inode->i_ctime; + arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); @@ -1432,8 +1455,39 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, } } arg->flags = flags; + arg->encrypted = IS_ENCRYPTED(inode); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len && + WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { + /* Don't set this if it's too big */ + arg->fscrypt_auth_len = 0; + } else { + arg->fscrypt_auth_len = ci->fscrypt_auth_len; + memcpy(arg->fscrypt_auth, ci->fscrypt_auth, + min_t(size_t, ci->fscrypt_auth_len, + sizeof(arg->fscrypt_auth))); + } +#endif /* CONFIG_FS_ENCRYPTION */ } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; +} +#else +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS; +} +#endif /* CONFIG_FS_ENCRYPTION */ + /* * Send a cap msg on the given inode. * @@ -1444,7 +1498,8 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, + false); if (!msg) { pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), @@ -1470,10 +1525,6 @@ static inline int __send_flush_snap(struct inode *inode, struct cap_msg_args arg; struct ceph_msg *msg; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); - if (!msg) - return -ENOMEM; - arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; @@ -1510,6 +1561,15 @@ static inline int __send_flush_snap(struct inode *inode, arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; + arg.encrypted = IS_ENCRYPTED(inode); + + /* No fscrypt_auth changes from a capsnap.*/ + arg.fscrypt_auth_len = 0; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), + GFP_NOFS, false); + if (!msg) + return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); @@ -2900,10 +2960,9 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) +int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, + int want, loff_t endoff, int *got) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int ret, _got, flags; @@ -2912,7 +2971,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got if (ret < 0) return ret; - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; @@ -2965,7 +3024,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got continue; } - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); @@ -3028,6 +3087,15 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return 0; } +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, + int *got) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + + return __ceph_get_caps(inode, fi, need, want, endoff, got); +} + /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. @@ -3323,6 +3391,9 @@ struct cap_extra_info { /* currently issued */ int issued; struct timespec64 btime; + u8 *fscrypt_auth; + u32 fscrypt_auth_len; + u64 fscrypt_file_size; }; /* @@ -3355,6 +3426,14 @@ static void handle_cap_grant(struct inode *inode, bool deleted_inode = false; bool fill_inline = false; + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, @@ -3421,6 +3500,14 @@ static void handle_cap_grant(struct inode *inode, dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || + memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, + ci->fscrypt_auth_len)) + pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", + __func__, ci->fscrypt_auth_len, + extra_info->fscrypt_auth_len); +#endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && @@ -3837,7 +3924,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -3854,8 +3942,16 @@ static bool handle_cap_trunc(struct inode *inode, issued |= implemented | dirty; - dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", - inode, mds, seq, truncate_size, truncate_seq); + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + + dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n", + __func__, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; @@ -4075,6 +4171,52 @@ retry: *target_cap = cap; } +#ifdef CONFIG_FS_ENCRYPTION +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); + if (extra->fscrypt_auth_len) { + ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); + extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, + GFP_KERNEL); + if (!extra->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, extra->fscrypt_auth, + extra->fscrypt_auth_len, bad); + } + + ceph_decode_32_safe(p, end, len, bad); + if (len >= sizeof(u64)) { + ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); + len -= sizeof(u64); + } + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#else +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + /* Don't care about these fields unless we're encryption-capable */ + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#endif + /* * Handle a caps message from the MDS. * @@ -4105,6 +4247,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout("handle_caps from mds%d\n", session->s_mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) @@ -4195,13 +4340,17 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } + if (msg_version >= 12) { + if (parse_fscrypt_fields(&p, end, &extra_info)) + goto bad; + } + /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); mutex_lock(&session->s_mutex); - inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); @@ -4292,7 +4441,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; case CEPH_CAP_OP_TRUNC: - queue_trunc = handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session, + &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -4309,12 +4459,15 @@ done: done_unlocked: iput(inode); out: + ceph_dec_mds_stopping_blocker(mdsc); + ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); + kfree(extra_info.fscrypt_auth); return; flush_cap_releases: @@ -4611,6 +4764,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode, return ret; } +/** + * ceph_encode_dentry_release - encode a dentry release into an outgoing request + * @p: outgoing request buffer + * @dentry: dentry to release + * @dir: dir to release it from + * @mds: mds that we're speaking to + * @drop: caps being dropped + * @unless: unless we have these caps + * + * Encode a dentry release into an outgoing request buffer. Returns 1 if the + * thing was released, or a negative error code otherwise. + */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) @@ -4643,13 +4808,25 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, if (ret && di->lease_session && di->lease_session->s_mds == mds) { dout("encode_dentry_release %p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); - rel->dname_len = cpu_to_le32(dentry->d_name.len); - memcpy(*p, dentry->d_name.name, dentry->d_name.len); - *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { + int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); + + if (ret2 < 0) + return ret2; + + rel->dname_len = cpu_to_le32(ret2); + *p += ret2; + } else { + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + } + } else { + spin_unlock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return ret; } diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c new file mode 100644 index 000000000000..e4d5cd56a80b --- /dev/null +++ b/fs/ceph/crypto.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The base64 encode/decode code was copied from fscrypt: + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * Written by Uday Savagaonkar, 2014. + * Modified by Jaegeuk Kim, 2015. + */ +#include <linux/ceph/ceph_debug.h> +#include <linux/xattr.h> +#include <linux/fscrypt.h> +#include <linux/ceph/striper.h> + +#include "super.h" +#include "mds_client.h" +#include "crypto.h" + +/* + * The base64url encoding used by fscrypt includes the '_' character, which may + * cause problems in snapshot names (which can not start with '_'). Thus, we + * used the base64 encoding defined for IMAP mailbox names (RFC 3501) instead, + * which replaces '-' and '_' by '+' and ','. + */ +static const char base64_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +int ceph_base64_encode(const u8 *src, int srclen, char *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + char *cp = dst; + + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; + bits += 8; + do { + bits -= 6; + *cp++ = base64_table[(ac >> bits) & 0x3f]; + } while (bits >= 6); + } + if (bits) + *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; + return cp - dst; +} + +int ceph_base64_decode(const char *src, int srclen, u8 *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64_table, src[i]); + + if (p == NULL || src[i] == 0) + return -1; + ac = (ac << 6) | (p - base64_table); + bits += 6; + if (bits >= 8) { + bits -= 8; + *bp++ = (u8)(ac >> bits); + } + } + if (ac & ((1 << bits) - 1)) + return -1; + return bp - dst; +} + +static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth; + u32 ctxlen; + + /* Non existent or too short? */ + if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1))) + return -ENOBUFS; + + /* Some format we don't recognize? */ + if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION) + return -ENOBUFS; + + ctxlen = le32_to_cpu(cfa->cfa_blob_len); + if (len < ctxlen) + return -ERANGE; + + memcpy(ctx, cfa->cfa_blob, ctxlen); + return ctxlen; +} + +static int ceph_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + int ret; + struct iattr attr = { }; + struct ceph_iattr cia = { }; + struct ceph_fscrypt_auth *cfa; + + WARN_ON_ONCE(fs_data); + + if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE) + return -EINVAL; + + cfa = kzalloc(sizeof(*cfa), GFP_KERNEL); + if (!cfa) + return -ENOMEM; + + cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + cfa->cfa_blob_len = cpu_to_le32(len); + memcpy(cfa->cfa_blob, ctx, len); + + cia.fscrypt_auth = cfa; + + ret = __ceph_setattr(inode, &attr, &cia); + if (ret == 0) + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + kfree(cia.fscrypt_auth); + return ret; +} + +static bool ceph_crypt_empty_dir(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return ci->i_rsubdirs + ci->i_rfiles == 1; +} + +static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) +{ + return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy; +} + +static struct fscrypt_operations ceph_fscrypt_ops = { + .get_context = ceph_crypt_get_context, + .set_context = ceph_crypt_set_context, + .get_dummy_policy = ceph_get_dummy_policy, + .empty_dir = ceph_crypt_empty_dir, +}; + +void ceph_fscrypt_set_ops(struct super_block *sb) +{ + fscrypt_set_ops(sb, &ceph_fscrypt_ops); +} + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ + fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy); +} + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + int ret, ctxsize; + bool encrypted = false; + struct ceph_inode_info *ci = ceph_inode(inode); + + ret = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (ret) + return ret; + if (!encrypted) + return 0; + + as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL); + if (!as->fscrypt_auth) + return -ENOMEM; + + ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob, + inode); + if (ctxsize < 0) + return ctxsize; + + as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize); + + WARN_ON_ONCE(ci->fscrypt_auth); + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth); + ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len, + GFP_KERNEL); + if (!ci->fscrypt_auth) + return -ENOMEM; + + inode->i_flags |= S_ENCRYPTED; + + return 0; +} + +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as) +{ + swap(req->r_fscrypt_auth, as->fscrypt_auth); +} + +/* + * User-created snapshots can't start with '_'. Snapshots that start with this + * character are special (hint: there aren't real snapshots) and use the + * following format: + * + * _<SNAPSHOT-NAME>_<INODE-NUMBER> + * + * where: + * - <SNAPSHOT-NAME> - the real snapshot name that may need to be decrypted, + * - <INODE-NUMBER> - the inode number (in decimal) for the actual snapshot + * + * This function parses these snapshot names and returns the inode + * <INODE-NUMBER>. 'name_len' will also bet set with the <SNAPSHOT-NAME> + * length. + */ +static struct inode *parse_longname(const struct inode *parent, + const char *name, int *name_len) +{ + struct inode *dir = NULL; + struct ceph_vino vino = { .snap = CEPH_NOSNAP }; + char *inode_number; + char *name_end; + int orig_len = *name_len; + int ret = -EIO; + + /* Skip initial '_' */ + name++; + name_end = strrchr(name, '_'); + if (!name_end) { + dout("Failed to parse long snapshot name: %s\n", name); + return ERR_PTR(-EIO); + } + *name_len = (name_end - name); + if (*name_len <= 0) { + pr_err("Failed to parse long snapshot name\n"); + return ERR_PTR(-EIO); + } + + /* Get the inode number */ + inode_number = kmemdup_nul(name_end + 1, + orig_len - *name_len - 2, + GFP_KERNEL); + if (!inode_number) + return ERR_PTR(-ENOMEM); + ret = kstrtou64(inode_number, 10, &vino.ino); + if (ret) { + dout("Failed to parse inode number: %s\n", name); + dir = ERR_PTR(ret); + goto out; + } + + /* And finally the inode */ + dir = ceph_find_inode(parent->i_sb, vino); + if (!dir) { + /* This can happen if we're not mounting cephfs on the root */ + dir = ceph_get_inode(parent->i_sb, vino, NULL); + if (!dir) + dir = ERR_PTR(-ENOENT); + } + if (IS_ERR(dir)) + dout("Can't find inode %s (%s)\n", inode_number, name); + +out: + kfree(inode_number); + return dir; +} + +int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, + char *buf) +{ + struct inode *dir = parent; + struct qstr iname; + u32 len; + int name_len; + int elen; + int ret; + u8 *cryptbuf = NULL; + + iname.name = d_name->name; + name_len = d_name->len; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (iname.name[0] == '_')) { + dir = parse_longname(parent, iname.name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + iname.name++; /* skip initial '_' */ + } + iname.len = name_len; + + if (!fscrypt_has_encryption_key(dir)) { + memcpy(buf, d_name->name, d_name->len); + elen = d_name->len; + goto out; + } + + /* + * Convert cleartext d_name to ciphertext. If result is longer than + * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes + * + * See: fscrypt_setup_filename + */ + if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) { + elen = -ENAMETOOLONG; + goto out; + } + + /* Allocate a buffer appropriate to hold the result */ + cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? NAME_MAX : len, + GFP_KERNEL); + if (!cryptbuf) { + elen = -ENOMEM; + goto out; + } + + ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len); + if (ret) { + elen = ret; + goto out; + } + + /* hash the end if the name is long enough */ + if (len > CEPH_NOHASH_NAME_MAX) { + u8 hash[SHA256_DIGEST_SIZE]; + u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX; + + /* + * hash the extra bytes and overwrite crypttext beyond that + * point with it + */ + sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash); + memcpy(extra, hash, SHA256_DIGEST_SIZE); + len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE; + } + + /* base64 encode the encrypted name */ + elen = ceph_base64_encode(cryptbuf, len, buf); + dout("base64-encoded ciphertext name = %.*s\n", elen, buf); + + /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ + WARN_ON(elen > 240); + if ((elen > 0) && (dir != parent)) { + char tmp_buf[NAME_MAX]; + + elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + elen, buf, dir->i_ino); + memcpy(buf, tmp_buf, elen); + } + +out: + kfree(cryptbuf); + if (dir != parent) { + if ((dir->i_state & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return elen; +} + +int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, + char *buf) +{ + WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)); + + return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf); +} + +/** + * ceph_fname_to_usr - convert a filename for userland presentation + * @fname: ceph_fname to be converted + * @tname: temporary name buffer to use for conversion (may be NULL) + * @oname: where converted name should be placed + * @is_nokey: set to true if key wasn't available during conversion (may be NULL) + * + * Given a filename (usually from the MDS), format it for presentation to + * userland. If @parent is not encrypted, just pass it back as-is. + * + * Otherwise, base64 decode the string, and then ask fscrypt to format it + * for userland presentation. + * + * Returns 0 on success or negative error code on error. + */ +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + struct inode *dir = fname->dir; + struct fscrypt_str _tname = FSTR_INIT(NULL, 0); + struct fscrypt_str iname; + char *name = fname->name; + int name_len = fname->name_len; + int ret; + + /* Sanity check that the resulting name will fit in the buffer */ + if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX) + return -EIO; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (name[0] == '_')) { + dir = parse_longname(dir, name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + name++; /* skip initial '_' */ + } + + if (!IS_ENCRYPTED(dir)) { + oname->name = fname->name; + oname->len = fname->name_len; + ret = 0; + goto out_inode; + } + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret) + goto out_inode; + + /* + * Use the raw dentry name as sent by the MDS instead of + * generating a nokey name via fscrypt. + */ + if (!fscrypt_has_encryption_key(dir)) { + if (fname->no_copy) + oname->name = fname->name; + else + memcpy(oname->name, fname->name, fname->name_len); + oname->len = fname->name_len; + if (is_nokey) + *is_nokey = true; + ret = 0; + goto out_inode; + } + + if (fname->ctext_len == 0) { + int declen; + + if (!tname) { + ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname); + if (ret) + goto out_inode; + tname = &_tname; + } + + declen = ceph_base64_decode(name, name_len, tname->name); + if (declen <= 0) { + ret = -EIO; + goto out; + } + iname.name = tname->name; + iname.len = declen; + } else { + iname.name = fname->ctext; + iname.len = fname->ctext_len; + } + + ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname); + if (!ret && (dir != fname->dir)) { + char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)]; + + name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + oname->len, oname->name, dir->i_ino); + memcpy(oname->name, tmp_buf, name_len); + oname->len = name_len; + } + +out: + fscrypt_fname_free_buffer(&_tname); +out_inode: + if ((dir != fname->dir) && !IS_ERR(dir)) { + if ((dir->i_state & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return ret; +} + +/** + * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper + * @dir: directory inode for readdir prep + * + * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as + * non-complete if this call results in having the directory unlocked. + * + * Returns: + * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked) + * 0 - if directory is still locked + * < 0 - if __fscrypt_prepare_readdir() fails + */ +int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + bool had_key = fscrypt_has_encryption_key(dir); + int err; + + if (!IS_ENCRYPTED(dir)) + return 0; + + err = __fscrypt_prepare_readdir(dir); + if (err) + return err; + if (!had_key && fscrypt_has_encryption_key(dir)) { + /* directory just got unlocked, mark it as not complete */ + ceph_dir_clear_complete(dir); + return 1; + } + return 0; +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num); + return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num); +} + +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags) +{ + dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num); + return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num, + gfp_flags); +} + +/** + * ceph_fscrypt_decrypt_pages - decrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the read data starts + * @len: max length to decrypt + * + * Decrypt an array of fscrypt'ed pages and return the amount of + * data decrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored (and should + * probably be zeroed by the caller). + * + * Returns the length of the decrypted data or a negative errno. + */ +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Decrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} + +/** + * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer + * @inode: inode associated with pages being decrypted + * @page: pointer to page array + * @off: offset into the file that the data in page[0] starts + * @map: pointer to extent array + * @ext_cnt: length of extent array + * + * Given an extent map and a page array, decrypt the received data in-place, + * skipping holes. Returns the offset into buffer of end of last decrypted + * block. + */ +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + int i, ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Nothing to do for empty array */ + if (ext_cnt == 0) { + dout("%s: empty array, ret 0\n", __func__); + return 0; + } + + ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len, + &objno, &objoff, &xlen); + + for (i = 0; i < ext_cnt; ++i) { + struct ceph_sparse_extent *ext = &map[i]; + int pgsoff = ext->off - objoff; + int pgidx = pgsoff >> PAGE_SHIFT; + int fret; + + if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) { + pr_warn("%s: bad encrypted sparse extent idx %d off %llx len %llx\n", + __func__, i, ext->off, ext->len); + return -EIO; + } + fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx], + off + pgsoff, ext->len); + dout("%s: [%d] 0x%llx~0x%llx fret %d\n", __func__, i, + ext->off, ext->len, fret); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret = pgsoff + fret; + } + dout("%s: ret %d\n", __func__, ret); + return ret; +} + +/** + * ceph_fscrypt_encrypt_pages - encrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the data starts + * @len: max length to encrypt + * @gfp: gfp flags to use for allocation + * + * Decrypt an array of cleartext pages and return the amount of + * data encrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored. + * + * Returns the length of the encrypted data or a negative errno. + */ +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len, gfp_t gfp) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Encrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i, gfp); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h new file mode 100644 index 000000000000..47e0c319fc68 --- /dev/null +++ b/fs/ceph/crypto.h @@ -0,0 +1,288 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ceph fscrypt functionality + */ + +#ifndef _CEPH_CRYPTO_H +#define _CEPH_CRYPTO_H + +#include <crypto/sha2.h> +#include <linux/fscrypt.h> + +#define CEPH_FSCRYPT_BLOCK_SHIFT 12 +#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT) +#define CEPH_FSCRYPT_BLOCK_MASK (~(CEPH_FSCRYPT_BLOCK_SIZE-1)) + +struct ceph_fs_client; +struct ceph_acl_sec_ctx; +struct ceph_mds_request; + +struct ceph_fname { + struct inode *dir; + char *name; // b64 encoded, possibly hashed + unsigned char *ctext; // binary crypttext (if any) + u32 name_len; // length of name buffer + u32 ctext_len; // length of crypttext + bool no_copy; +}; + +/* + * Header for the crypted file when truncating the size, this + * will be sent to MDS, and the MDS will update the encrypted + * last block and then truncate the size. + */ +struct ceph_fscrypt_truncate_size_header { + __u8 ver; + __u8 compat; + + /* + * It will be sizeof(assert_ver + file_offset + block_size) + * if the last block is empty when it's located in a file + * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE. + */ + __le32 data_len; + + __le64 change_attr; + __le64 file_offset; + __le32 block_size; +} __packed; + +struct ceph_fscrypt_auth { + __le32 cfa_version; + __le32 cfa_blob_len; + u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE]; +} __packed; + +#define CEPH_FSCRYPT_AUTH_VERSION 1 +static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) +{ + u32 ctxsize = le32_to_cpu(fa->cfa_blob_len); + + return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize; +} + +#ifdef CONFIG_FS_ENCRYPTION +/* + * We want to encrypt filenames when creating them, but the encrypted + * versions of those names may have illegal characters in them. To mitigate + * that, we base64 encode them, but that gives us a result that can exceed + * NAME_MAX. + * + * Follow a similar scheme to fscrypt itself, and cap the filename to a + * smaller size. If the ciphertext name is longer than the value below, then + * sha256 hash the remaining bytes. + * + * For the fscrypt_nokey_name struct the dirhash[2] member is useless in ceph + * so the corresponding struct will be: + * + * struct fscrypt_ceph_nokey_name { + * u8 bytes[157]; + * u8 sha256[SHA256_DIGEST_SIZE]; + * }; // 180 bytes => 240 bytes base64-encoded, which is <= NAME_MAX (255) + * + * (240 bytes is the maximum size allowed for snapshot names to take into + * account the format: '_<SNAPSHOT-NAME>_<INODE-NUMBER>'.) + * + * Note that for long names that end up having their tail portion hashed, we + * must also store the full encrypted name (in the dentry's alternate_name + * field). + */ +#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE) + +#define CEPH_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + +int ceph_base64_encode(const u8 *src, int srclen, char *dst); +int ceph_base64_decode(const char *src, int srclen, u8 *dst); + +void ceph_fscrypt_set_ops(struct super_block *sb); + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc); + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as); +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as); +int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, + char *buf); +int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, + char *buf); + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (!IS_ENCRYPTED(parent)) + return 0; + return fscrypt_fname_alloc_buffer(NAME_MAX, fname); +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (IS_ENCRYPTED(parent)) + fscrypt_fname_free_buffer(fname); +} + +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey); +int ceph_fscrypt_prepare_readdir(struct inode *dir); + +static inline unsigned int ceph_fscrypt_blocks(u64 off, u64 len) +{ + /* crypto blocks cannot span more than one page */ + BUILD_BUG_ON(CEPH_FSCRYPT_BLOCK_SHIFT > PAGE_SHIFT); + + return ((off+len+CEPH_FSCRYPT_BLOCK_SIZE-1) >> CEPH_FSCRYPT_BLOCK_SHIFT) - + (off >> CEPH_FSCRYPT_BLOCK_SHIFT); +} + +/* + * If we have an encrypted inode then we must adjust the offset and + * range of the on-the-wire read to cover an entire encryption block. + * The copy will be done using the original offset and length, after + * we've decrypted the result. + */ +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ + if (IS_ENCRYPTED(inode)) { + *len = ceph_fscrypt_blocks(*off, *len) * CEPH_FSCRYPT_BLOCK_SIZE; + *off &= CEPH_FSCRYPT_BLOCK_MASK; + } +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num); +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags); +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len); +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt); +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len, gfp_t gfp); + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return fscrypt_is_bounce_page(page) ? fscrypt_pagecache_page(page) : page; +} + +#else /* CONFIG_FS_ENCRYPTION */ + +static inline void ceph_fscrypt_set_ops(struct super_block *sb) +{ +} + +static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ +} + +static inline int ceph_fscrypt_prepare_context(struct inode *dir, + struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + return 0; +} + +static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ +} + +static inline int ceph_encode_encrypted_dname(struct inode *parent, + struct qstr *d_name, char *buf) +{ + memcpy(buf, d_name->name, d_name->len); + return d_name->len; +} + +static inline int ceph_encode_encrypted_fname(struct inode *parent, + struct dentry *dentry, char *buf) +{ + return -EOPNOTSUPP; +} + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + return 0; +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ +} + +static inline int ceph_fname_to_usr(const struct ceph_fname *fname, + struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + oname->name = fname->name; + oname->len = fname->name_len; + return 0; +} + +static inline int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + return 0; +} + +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ +} + +static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_extents(struct inode *inode, + struct page **page, u64 off, + struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len, gfp_t gfp) +{ + return 0; +} + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return page; +} +#endif /* CONFIG_FS_ENCRYPTION */ + +static inline loff_t ceph_fscrypt_page_offset(struct page *page) +{ + return page_offset(ceph_fscrypt_pagecache_page(page)); +} + +#endif /* _CEPH_CRYPTO_H */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index bdcffb04513f..854cbdd66661 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -9,6 +9,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -241,7 +242,9 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, di = ceph_dentry(dentry); if (d_unhashed(dentry) || d_really_is_negative(dentry) || - di->lease_shared_gen != shared_gen) { + di->lease_shared_gen != shared_gen || + ((dentry->d_flags & DCACHE_NOKEY_NAME) && + fscrypt_has_encryption_key(dir))) { spin_unlock(&dentry->d_lock); dput(dentry); err = -EAGAIN; @@ -340,6 +343,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 2; } + err = ceph_fscrypt_prepare_readdir(inode); + if (err < 0) + return err; + spin_lock(&ci->i_ceph_lock); /* request Fx cap. if have Fx, we don't need to release Fs cap * for later create/unlink. */ @@ -389,6 +396,7 @@ more: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); if (err) { ceph_mdsc_put_request(req); @@ -402,11 +410,21 @@ more: req->r_inode_drop = CEPH_CAP_FILE_EXCL; } if (dfi->last_name) { - req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); + struct qstr d_name = { .name = dfi->last_name, + .len = strlen(dfi->last_name) }; + + req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } + + err = ceph_encode_encrypted_dname(inode, &d_name, + req->r_path2); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } } else if (is_hash_order(ctx->pos)) { req->r_args.readdir.offset_hash = cpu_to_le32(fpos_hash(ctx->pos)); @@ -511,15 +529,20 @@ more: for (; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - BUG_ON(rde->offset < ctx->pos); + if (rde->offset < ctx->pos) { + pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n", + __func__, rde->offset, ctx->pos); + return -EIO; + } + + if (WARN_ON_ONCE(!rde->inode.in)) + return -EIO; ctx->pos = rde->offset; dout("readdir (%d/%d) -> %llx '%.*s' %p\n", i, rinfo->dir_nr, ctx->pos, rde->name_len, rde->name, &rde->inode.in); - BUG_ON(!rde->inode.in); - if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { @@ -532,6 +555,8 @@ more: dout("filldir stopping us...\n"); return 0; } + + /* Reset the lengths to their original allocated vals */ ctx->pos++; } @@ -586,7 +611,6 @@ more: dfi->dir_ordered_count); spin_unlock(&ci->i_ceph_lock); } - dout("readdir %p file %p done.\n", inode, file); return 0; } @@ -760,6 +784,18 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); + if (IS_ENCRYPTED(dir)) { + bool had_key = fscrypt_has_encryption_key(dir); + + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + return ERR_PTR(err); + + /* mark directory as incomplete if it has been unlocked */ + if (!had_key && fscrypt_has_encryption_key(dir)) + ceph_dir_clear_complete(dir); + } + /* can we conclude ENOENT locally? */ if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -865,13 +901,6 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out; } - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - goto out; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) - goto out; - dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); @@ -879,6 +908,17 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, err = PTR_ERR(req); goto out; } + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + + if (S_ISREG(mode) && IS_ENCRYPTED(dir)) + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; @@ -889,13 +929,13 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (!err) @@ -912,12 +952,50 @@ static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, return ceph_mknod(idmap, dir, dentry, mode, 0); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + int err; + int len = strlen(dest); + struct fscrypt_str osd_link = FSTR_INIT(NULL, 0); + + err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX, + &osd_link); + if (err) + goto out; + + err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link); + if (err) + goto out; + + req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out; + } + + len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2); + req->r_path2[len] = '\0'; +out: + fscrypt_fname_free_buffer(&osd_link); + return err; +} +#else +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + return -EOPNOTSUPP; +} +#endif + static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + umode_t mode = S_IFLNK | 0777; int err; if (ceph_snap(dir) != CEPH_NOSNAP) @@ -932,38 +1010,48 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } - err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx); - if (err < 0) - goto out; - dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_path2 = kstrdup(dest, GFP_KERNEL); - if (!req->r_path2) { - err = -ENOMEM; - ceph_mdsc_put_request(req); - goto out; + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; } + req->r_parent = dir; ihold(dir); + if (IS_ENCRYPTED(req->r_new_inode)) { + err = prep_encrypted_symlink_target(req, dest); + if (err) + goto out_req; + } else { + req->r_path2 = kstrdup(dest, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out_req; + } + } + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (err) @@ -1003,14 +1091,12 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = -EDQUOT; goto out; } - - mode |= S_IFDIR; - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - goto out; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) + if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && + !fscrypt_has_encryption_key(dir)) { + err = -ENOKEY; goto out; + } + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -1018,6 +1104,14 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto out; } + mode |= S_IFDIR; + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; @@ -1027,15 +1121,15 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (!err) @@ -1063,6 +1157,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n", dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); @@ -1310,6 +1408,11 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) return err; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -1765,6 +1868,10 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir, *inode; struct ceph_mds_client *mdsc; + valid = fscrypt_d_revalidate(dentry, flags); + if (valid <= 0) + return valid; + if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); @@ -1777,8 +1884,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); } - dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, - dentry, inode, ceph_dentry(dentry)->offset); + dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry, + dentry, inode, ceph_dentry(dentry)->offset, + !!(dentry->d_flags & DCACHE_NOKEY_NAME)); mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f780e4e0d062..8559990a59a5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Basic fh @@ -535,7 +536,9 @@ static int ceph_get_name(struct dentry *parent, char *name, { struct ceph_mds_client *mdsc; struct ceph_mds_request *req; + struct inode *dir = d_inode(parent); struct inode *inode = d_inode(child); + struct ceph_mds_reply_info_parsed *rinfo; int err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -547,30 +550,47 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - inode_lock(d_inode(parent)); - + inode_lock(dir); req->r_inode = inode; ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); - req->r_parent = d_inode(parent); - ihold(req->r_parent); + req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode_unlock(dir); - inode_unlock(d_inode(parent)); + if (err) + goto out; - if (!err) { - struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + rinfo = &req->r_reply_info; + if (!IS_ENCRYPTED(dir)) { memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; - dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(inode), name); } else { - dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(inode), err); - } + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; + + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) + goto out; + err = ceph_fname_to_usr(&fname, NULL, &oname, NULL); + if (!err) { + memcpy(name, oname.name, oname.len); + name[oname.len] = 0; + } + ceph_fname_free_buffer(dir, &oname); + } +out: + dout("get_name %p ino %llx.%llx err %d %s%s\n", + child, ceph_vinop(inode), err, + err ? "" : "name ", err ? "" : name); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 63efe5389783..b1da02f5dbe3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -366,8 +366,13 @@ int ceph_open(struct inode *inode, struct file *file) /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ flags = file->f_flags & ~(O_CREAT|O_EXCL); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { flags = O_DIRECTORY; /* mds likes to know */ + } else if (S_ISREG(inode->i_mode)) { + err = fscrypt_file_open(inode, file); + if (err) + return err; + } dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, ceph_vinop(inode), file, flags, file->f_flags); @@ -604,7 +609,8 @@ out: ceph_mdsc_release_dir_caps(req); } -static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, +static int ceph_finish_async_create(struct inode *dir, struct inode *inode, + struct dentry *dentry, struct file *file, umode_t mode, struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as_ctx, @@ -616,7 +622,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_mds_reply_info_in iinfo = { .in = &in }; struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *inode; struct timespec64 now; struct ceph_string *pool_ns; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -625,10 +630,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, ktime_get_real_ts64(&now); - inode = ceph_get_inode(dentry->d_sb, vino); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iinfo.inline_version = CEPH_INLINE_NONE; iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); @@ -686,8 +687,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, ceph_dir_clear_complete(dir); if (!d_unhashed(dentry)) d_drop(dentry); - if (inode->i_state & I_NEW) - discard_new_inode(inode); + discard_new_inode(inode); } else { struct dentry *dn; @@ -733,6 +733,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct inode *new_inode = NULL; struct dentry *dn; struct ceph_acl_sec_ctx as_ctx = {}; bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); @@ -755,15 +756,16 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, */ flags &= ~O_TRUNC; +retry: if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - return err; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) + + new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); goto out_ctx; + } /* Async create can't handle more than a page of xattrs */ if (as_ctx.pagelist && !list_is_singular(&as_ctx.pagelist->head)) @@ -772,7 +774,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, /* If it's not being looked up, it's negative */ return -ENOENT; } -retry: + /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) { @@ -787,6 +789,12 @@ retry: req->r_args.open.mask = cpu_to_le32(mask); req->r_parent = dir; ihold(dir); + if (IS_ENCRYPTED(dir)) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + goto out_req; + } if (flags & O_CREAT) { struct ceph_file_layout lo; @@ -794,32 +802,47 @@ retry: req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } - if (try_async && - (req->r_dir_caps = - try_prep_async_create(dir, dentry, &lo, - &req->r_deleg_ino))) { + + ceph_as_ctx_to_req(req, &as_ctx); + + if (try_async && (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; struct ceph_dentry_info *di = ceph_dentry(dentry); set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); req->r_callback = ceph_async_create_cb; + /* Hash inode before RPC */ + new_inode = ceph_get_inode(dir->i_sb, vino, new_inode); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); + new_inode = NULL; + goto out_req; + } + WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); + spin_lock(&dentry->d_lock); di->flags |= CEPH_DENTRY_ASYNC_CREATE; spin_unlock(&dentry->d_lock); err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { - err = ceph_finish_async_create(dir, dentry, - file, mode, req, - &as_ctx, &lo); + err = ceph_finish_async_create(dir, new_inode, + dentry, file, + mode, req, + &as_ctx, &lo); + new_inode = NULL; } else if (err == -EJUKEBOX) { restore_deleg_ino(dir, req->r_deleg_ino); ceph_mdsc_put_request(req); + discard_new_inode(new_inode); + ceph_release_acl_sec_ctx(&as_ctx); + memset(&as_ctx, 0, sizeof(as_ctx)); + new_inode = NULL; try_async = false; ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto retry; @@ -830,6 +853,8 @@ retry: } set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_new_inode = new_inode; + new_inode = NULL; err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); if (err == -ENOENT) { dentry = ceph_handle_snapdir(req, dentry); @@ -858,6 +883,13 @@ retry: dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { + if (IS_ENCRYPTED(dir) && + !fscrypt_has_permitted_context(dir, d_inode(dentry))) { + pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n", + ceph_vinop(dir), ceph_vinop(d_inode(dentry))); + goto out_req; + } + dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { struct inode *newino = d_inode(dentry); @@ -870,6 +902,7 @@ retry: } out_req: ceph_mdsc_put_request(req); + iput(new_inode); out_ctx: ceph_release_acl_sec_ctx(&as_ctx); dout("atomic_open result=%d\n", err); @@ -924,21 +957,24 @@ enum { * If we get a short result from the OSD, check against i_size; we need to * only return a short read to the caller if we hit EOF. */ -static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, - int *retry_op) +ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_client *osdc = &fsc->client->osdc; ssize_t ret; - u64 off = iocb->ki_pos; + u64 off = *ki_pos; u64 len = iov_iter_count(to); u64 i_size = i_size_read(inode); + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 objver = 0; - dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, - (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len); + + if (ceph_inode_is_shutdown(inode)) + return -EIO; if (!len) return 0; @@ -962,10 +998,21 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, bool more; int idx; size_t left; + struct ceph_osd_req_op *op; + u64 read_off = off; + u64 read_len = len; + + /* determine new offset/length if encrypted */ + ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); + + dout("sync_read orig %llu~%llu reading %llu~%llu", + off, len, read_off, read_len); req = ceph_osdc_new_request(osdc, &ci->i_layout, - ci->i_vino, off, &len, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + ci->i_vino, read_off, &read_len, 0, 1, + sparse ? CEPH_OSD_OP_SPARSE_READ : + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -973,10 +1020,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } + /* adjust len downward if the request truncated the len */ + if (off + len > read_off + read_len) + len = read_off + read_len - off; more = len < iov_iter_count(to); - num_pages = calc_pages_for(off, len); - page_off = off & ~PAGE_MASK; + num_pages = calc_pages_for(read_off, read_len); + page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ceph_osdc_put_request(req); @@ -984,29 +1034,75 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, + osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, + offset_in_page(read_off), false, false); + + op = &req->r_ops[0]; + if (sparse) { + ret = ceph_alloc_sparse_ext_map(op); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, - len, ret); + read_len, ret); - ceph_osdc_put_request(req); + if (ret > 0) + objver = req->r_version; i_size = i_size_read(inode); dout("sync_read %llu~%llu got %zd i_size %llu%s\n", off, len, ret, i_size, (more ? " MORE" : "")); - if (ret == -ENOENT) + /* Fix it to go to end of extent map */ + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) ret = 0; + + if (ret > 0 && IS_ENCRYPTED(inode)) { + int fret; + + fret = ceph_fscrypt_decrypt_extents(inode, pages, + read_off, op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (fret < 0) { + ret = fret; + ceph_osdc_put_request(req); + break; + } + + /* account for any partial block at the beginning */ + fret -= (off - read_off); + + /* + * Short read after big offset adjustment? + * Nothing is usable, just call it a zero + * len read. + */ + fret = max(fret, 0); + + /* account for partial block at the end */ + ret = min_t(ssize_t, fret, len); + } + + ceph_osdc_put_request(req); + + /* Short read but not EOF? Zero out the remainder. */ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; + dout("sync_read zero gap %llu~%llu\n", - off + ret, off + ret + zlen); + off + ret, off + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); ret += zlen; } @@ -1014,15 +1110,16 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, idx = 0; left = ret > 0 ? ret : 0; while (left > 0) { - size_t len, copied; - page_off = off & ~PAGE_MASK; - len = min_t(size_t, left, PAGE_SIZE - page_off); + size_t plen, copied; + + plen = min_t(size_t, left, PAGE_SIZE - page_off); SetPageUptodate(pages[idx]); copied = copy_page_to_iter(pages[idx++], - page_off, len, to); + page_off, plen, to); off += copied; left -= copied; - if (copied < len) { + page_off = 0; + if (copied < plen) { ret = -EFAULT; break; } @@ -1039,21 +1136,37 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } - if (off > iocb->ki_pos) { - if (off >= i_size) { - *retry_op = CHECK_EOF; - ret = i_size - iocb->ki_pos; - iocb->ki_pos = i_size; - } else { - ret = off - iocb->ki_pos; - iocb->ki_pos = off; + if (ret > 0) { + if (off > *ki_pos) { + if (off >= i_size) { + *retry_op = CHECK_EOF; + ret = i_size - *ki_pos; + *ki_pos = i_size; + } else { + ret = off - *ki_pos; + *ki_pos = off; + } } - } + if (last_objver) + *last_objver = objver; + } dout("sync_read result %zd retry_op %d\n", ret, *retry_op); return ret; } +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *retry_op) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + + dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos, + iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL); +} + struct ceph_aio_request { struct kiocb *iocb; size_t total_len; @@ -1125,8 +1238,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_osd_req_op *op = &req->r_ops[0]; struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; unsigned int len = osd_data->bvec_pos.iter.bi_size; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); @@ -1147,6 +1262,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } rc = -ENOMEM; } else if (!aio_req->write) { + if (sparse && rc >= 0) + rc = ceph_sparse_ext_map_end(op); if (rc == -ENOENT) rc = 0; if (rc >= 0 && len > rc) { @@ -1283,6 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; bool should_dirty = !write && user_backed_iter(iter); + bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD); if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -1310,6 +1428,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, while (iov_iter_count(iter) > 0) { u64 size = iov_iter_count(iter); ssize_t len; + struct ceph_osd_req_op *op; + int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1320,8 +1440,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, 1, - write ? CEPH_OSD_OP_WRITE : - CEPH_OSD_OP_READ, + write ? CEPH_OSD_OP_WRITE : readop, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, @@ -1372,6 +1491,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + op = &req->r_ops[0]; + if (sparse) { + ret = ceph_alloc_sparse_ext_map(op); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } if (aio_req) { aio_req->total_len += len; @@ -1399,8 +1526,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, size = i_size_read(inode); if (!write) { - if (ret == -ENOENT) + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { struct iov_iter i; int zlen = min_t(size_t, len - ret, @@ -1481,13 +1611,12 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_vino vino; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; struct page **pages; u64 len; int num_pages; int written = 0; - int flags; int ret; bool check_caps = false; struct timespec64 mtime = current_time(inode); @@ -1505,79 +1634,350 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, return ret; ceph_fscache_invalidate(inode, false); - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + count - 1) >> PAGE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); - - flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; int n; - - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, 1, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, - ci->i_truncate_size, - false); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - break; - } + u64 write_pos = pos; + u64 write_len = len; + u64 objnum, objoff; + u32 xlen; + u64 assert_ver = 0; + bool rmw; + bool first, last; + struct iov_iter saved_iter = *from; + size_t off; + + ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len); + + /* clamp the length to the end of first object */ + ceph_calc_file_object_mapping(&ci->i_layout, write_pos, + write_len, &objnum, &objoff, + &xlen); + write_len = xlen; + + /* adjust len downward if it goes beyond current object */ + if (pos + len > write_pos + write_len) + len = write_pos + write_len - pos; /* - * write from beginning of first page, - * regardless of io alignment + * If we had to adjust the length or position to align with a + * crypto block, then we must do a read/modify/write cycle. We + * use a version assertion to redrive the thing if something + * changes in between. */ - num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + first = pos != write_pos; + last = (pos + len) != (write_pos + write_len); + rmw = first || last; + + dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n", + ci->i_vino.ino, pos, len, write_pos, write_len, + rmw ? "" : "no "); + /* + * The data is emplaced into the page as it would be if it were + * in an array of pagecache pages. + */ + num_pages = calc_pages_for(write_pos, write_len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); - goto out; + break; + } + + /* Do we need to preload the pages? */ + if (rmw) { + u64 first_pos = write_pos; + u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE; + u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE; + struct ceph_osd_req_op *op; + + /* We should only need to do this for encrypted inodes */ + WARN_ON_ONCE(!IS_ENCRYPTED(inode)); + + /* No need to do two reads if first and last blocks are same */ + if (first && last_pos == first_pos) + last = false; + + /* + * Allocate a read request for one or two extents, + * depending on how the request was aligned. + */ + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, first ? first_pos : last_pos, + &read_len, 0, (first && last) ? 2 : 1, + CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ceph_release_page_vector(pages, num_pages); + ret = PTR_ERR(req); + break; + } + + /* Something is misaligned! */ + if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + ret = -EIO; + break; + } + + /* Add extent for first block? */ + op = &req->r_ops[0]; + + if (first) { + osd_req_op_extent_osd_data_pages(req, 0, pages, + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + false, false); + /* We only expect a single extent here */ + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Add extent for last block */ + if (last) { + /* Init the other extent if first extent has been used */ + if (first) { + op = &req->r_ops[1]; + osd_req_op_extent_init(req, 1, + CEPH_OSD_OP_SPARSE_READ, + last_pos, CEPH_FSCRYPT_BLOCK_SIZE, + ci->i_truncate_size, + ci->i_truncate_seq); + } + + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + osd_req_op_extent_osd_data_pages(req, first ? 1 : 0, + &pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + false, false); + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + + /* FIXME: length field is wrong if there are 2 extents */ + ceph_update_read_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + read_len, ret); + + /* Ok if object is not already present */ + if (ret == -ENOENT) { + /* + * If there is no object, then we can't assert + * on its version. Set it to 0, and we'll use an + * exclusive create instead. + */ + ceph_osdc_put_request(req); + ret = 0; + + /* + * zero out the soon-to-be uncopied parts of the + * first and last pages. + */ + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + if (last) + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else { + if (ret < 0) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + op = &req->r_ops[0]; + if (op->extent.sparse_ext_cnt == 0) { + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + else + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + if (first && last) { + op = &req->r_ops[1]; + if (op->extent.sparse_ext_cnt == 0) { + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Grab assert version. It must be non-zero. */ + assert_ver = req->r_version; + WARN_ON_ONCE(ret > 0 && assert_ver == 0); + + ceph_osdc_put_request(req); + if (first) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[0], CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + if (last) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + } } left = len; + off = offset_in_page(pos); for (n = 0; n < num_pages; n++) { - size_t plen = min_t(size_t, left, PAGE_SIZE); - ret = copy_page_from_iter(pages[n], 0, plen, from); + size_t plen = min_t(size_t, left, PAGE_SIZE - off); + + /* copy the data */ + ret = copy_page_from_iter(pages[n], off, plen, from); if (ret != plen) { ret = -EFAULT; break; } + off = 0; left -= ret; } - if (ret < 0) { + dout("sync_write write failed with %d\n", ret); ceph_release_page_vector(pages, num_pages); - goto out; + break; } - req->r_inode = inode; + if (IS_ENCRYPTED(inode)) { + ret = ceph_fscrypt_encrypt_pages(inode, pages, + write_pos, write_len, + GFP_KERNEL); + if (ret < 0) { + dout("encryption failed with %d\n", ret); + ceph_release_page_vector(pages, num_pages); + break; + } + } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, - false, true); + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, write_pos, &write_len, + rmw ? 1 : 0, rmw ? 2 : 1, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + snapc, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + ceph_release_page_vector(pages, num_pages); + break; + } + dout("sync_write write op %lld~%llu\n", write_pos, write_len); + osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len, + offset_in_page(write_pos), false, + true); + req->r_inode = inode; req->r_mtime = mtime; - ceph_osdc_start_request(&fsc->client->osdc, req); - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + /* Set up the assertion */ + if (rmw) { + /* + * Set up the assertion. If we don't have a version + * number, then the object doesn't exist yet. Use an + * exclusive create instead of a version assertion in + * that case. + */ + if (assert_ver) { + osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0); + req->r_ops[0].assert_ver.ver = assert_ver; + } else { + osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE, + CEPH_OSD_OP_FLAG_EXCL); + } + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); -out: ceph_osdc_put_request(req); if (ret != 0) { + dout("sync_write osd write returned %d\n", ret); + /* Version changed! Must re-do the rmw cycle */ + if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) || + (!assert_ver && ret == -EEXIST)) { + /* We should only ever see this on a rmw */ + WARN_ON_ONCE(!rmw); + + /* The version should never go backward */ + WARN_ON_ONCE(ret == -EOVERFLOW); + + *from = saved_iter; + + /* FIXME: limit number of times we loop? */ + continue; + } ceph_set_error_write(ci); break; } ceph_clear_error_write(ci); + + /* + * We successfully wrote to a range of the file. Declare + * that region of the pagecache invalid. + */ + ret = invalidate_inode_pages2_range( + inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + len - 1) >> PAGE_SHIFT); + if (ret < 0) { + dout("invalidate_inode_pages2_range returned %d\n", + ret); + ret = 0; + } pos += len; written += len; + dout("sync_write written %d\n", written); if (pos > i_size_read(inode)) { check_caps = ceph_inode_set_size(inode, pos); if (check_caps) @@ -1591,6 +1991,7 @@ out: ret = written; iocb->ki_pos = pos; } + dout("sync_write returning %d\n", ret); return ret; } @@ -1648,7 +2049,9 @@ again: ceph_cap_string(got)); if (!ceph_has_inline_data(ci)) { - if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + if (!retry_op && + (iocb->ki_flags & IOCB_DIRECT) && + !IS_ENCRYPTED(inode)) { ret = ceph_direct_read_write(iocb, to, NULL, NULL); if (ret >= 0 && ret < len) @@ -1934,7 +2337,7 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode)) written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); else @@ -2165,6 +2568,9 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2486,6 +2892,10 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, return -EOPNOTSUPP; } + /* Every encrypted inode gets its own key, so we can't offload them */ + if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode)) + return -EOPNOTSUPP; + if (len < src_ci->i_layout.object_size) return -EOPNOTSUPP; /* no remote copy will be done */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8e5f41d45283..800ab7920513 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -14,10 +14,12 @@ #include <linux/random.h> #include <linux/sort.h> #include <linux/iversion.h> +#include <linux/fscrypt.h> #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> /* @@ -33,6 +35,7 @@ */ static const struct inode_operations ceph_symlink_iops; +static const struct inode_operations ceph_encrypted_symlink_iops; static void ceph_inode_work(struct work_struct *work); @@ -52,17 +55,99 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +/** + * ceph_new_inode - allocate a new inode in advance of an expected create + * @dir: parent directory for new inode + * @dentry: dentry that may eventually point to new inode + * @mode: mode of new inode + * @as_ctx: pointer to inherited security context + * + * Allocate a new inode in advance of an operation to create a new inode. + * This allocates the inode and sets up the acl_sec_ctx with appropriate + * info for the new inode. + * + * Returns a pointer to the new inode or an ERR_PTR. + */ +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) +{ + int err; + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!S_ISLNK(*mode)) { + err = ceph_pre_init_acls(dir, mode, as_ctx); + if (err < 0) + goto out_err; + } + + inode->i_state = 0; + inode->i_mode = *mode; + + err = ceph_security_init_secctx(dentry, *mode, as_ctx); + if (err < 0) + goto out_err; + + /* + * We'll skip setting fscrypt context for snapshots, leaving that for + * the handle_reply(). + */ + if (ceph_snap(dir) != CEPH_SNAPDIR) { + err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); + if (err) + goto out_err; + } + + return inode; +out_err: + iput(inode); + return ERR_PTR(err); +} + +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ + if (as_ctx->pagelist) { + req->r_pagelist = as_ctx->pagelist; + as_ctx->pagelist = NULL; + } + ceph_fscrypt_as_ctx_to_req(req, as_ctx); +} + +/** + * ceph_get_inode - find or create/hash a new inode + * @sb: superblock to search and allocate in + * @vino: vino to search for + * @newino: optional new inode to insert if one isn't found (may be NULL) + * + * Search for or insert a new inode into the hash for the given vino, and + * return a reference to it. If new is non-NULL, its reference is consumed. + */ +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, + struct inode *newino) { struct inode *inode; if (ceph_vino_is_reserved(vino)) return ERR_PTR(-EREMOTEIO); - inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, - ceph_set_ino_cb, &vino); - if (!inode) + if (newino) { + inode = inode_insert5(newino, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode != newino) + iput(newino); + } else { + inode = iget5_locked(sb, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + } + + if (!inode) { + dout("No inode found for %llx.%llx\n", vino.ino, vino.snap); return ERR_PTR(-ENOMEM); + } dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); @@ -78,8 +163,9 @@ struct inode *ceph_get_snapdir(struct inode *parent) .ino = ceph_ino(parent), .snap = CEPH_SNAPDIR, }; - struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL); struct ceph_inode_info *ci = ceph_inode(inode); + int ret = -ENOTDIR; if (IS_ERR(inode)) return inode; @@ -100,11 +186,29 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; inode->i_mtime = parent->i_mtime; - inode->i_ctime = parent->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); inode->i_atime = parent->i_atime; ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; +#ifdef CONFIG_FS_ENCRYPTION + /* if encrypted, just borrow fscrypt_auth from parent */ + if (IS_ENCRYPTED(parent)) { + struct ceph_inode_info *pci = ceph_inode(parent); + + ci->fscrypt_auth = kmemdup(pci->fscrypt_auth, + pci->fscrypt_auth_len, + GFP_KERNEL); + if (ci->fscrypt_auth) { + inode->i_flags |= S_ENCRYPTED; + ci->fscrypt_auth_len = pci->fscrypt_auth_len; + } else { + dout("Failed to alloc snapdir fscrypt_auth\n"); + ret = -ENOMEM; + goto err; + } + } +#endif if (inode->i_state & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; @@ -118,7 +222,7 @@ err: discard_new_inode(inode); else iput(inode); - return ERR_PTR(-ENOTDIR); + return ERR_PTR(ret); } const struct inode_operations ceph_file_iops = { @@ -517,6 +621,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; + ci->i_truncate_pagecache_size = 0; ci->i_max_size = 0; ci->i_reported_size = 0; @@ -547,6 +652,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); +#ifdef CONFIG_FS_ENCRYPTION + ci->fscrypt_auth = NULL; + ci->fscrypt_auth_len = 0; +#endif return &ci->netfs.inode; } @@ -555,6 +664,10 @@ void ceph_free_inode(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); kfree(ci->i_symlink); +#ifdef CONFIG_FS_ENCRYPTION + kfree(ci->fscrypt_auth); +#endif + fscrypt_free_inode(inode); kmem_cache_free(ceph_inode_cachep, ci); } @@ -575,6 +688,7 @@ void ceph_evict_inode(struct inode *inode) clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); + fscrypt_put_encryption_info(inode); __ceph_remove_caps(ci); @@ -650,7 +764,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { - dout("truncate_seq %u -> %u\n", + dout("%s truncate_seq %u -> %u\n", __func__, ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; @@ -674,11 +788,26 @@ int ceph_fill_file_size(struct inode *inode, int issued, } } } - if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && - ci->i_truncate_size != truncate_size) { - dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, - truncate_size); + + /* + * It's possible that the new sizes of the two consecutive + * size truncations will be in the same fscrypt last block, + * and we need to truncate the corresponding page caches + * anyway. + */ + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { + dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__, + ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode)); + ci->i_truncate_size = truncate_size; + + if (IS_ENCRYPTED(inode)) { + dout("%s truncate_pagecache_size %lld -> %llu\n", + __func__, ci->i_truncate_pagecache_size, size); + ci->i_truncate_pagecache_size = size; + } else { + ci->i_truncate_pagecache_size = truncate_size; + } } return queue_trunc; } @@ -688,6 +817,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, struct timespec64 *mtime, struct timespec64 *atime) { struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 ictime = inode_get_ctime(inode); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -696,11 +826,11 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || - timespec64_compare(ctime, &inode->i_ctime) > 0) { + timespec64_compare(ctime, &ictime) > 0) { dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ictime.tv_sec, ictime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + inode_set_ctime_to_ts(inode, *ctime); } if (ci->i_version == 0 || ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { @@ -738,7 +868,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; + inode_set_ctime_to_ts(inode, *ctime); inode->i_mtime = *mtime; inode->i_atime = *atime; ci->i_time_warp_seq = time_warp_seq; @@ -751,6 +881,34 @@ void ceph_fill_file_time(struct inode *inode, int issued, inode, time_warp_seq, ci->i_time_warp_seq); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym) +{ + int declen; + u8 *sym; + + sym = kmalloc(enclen + 1, GFP_NOFS); + if (!sym) + return -ENOMEM; + + declen = ceph_base64_decode(encsym, enclen, sym); + if (declen < 0) { + pr_err("%s: can't decode symlink (%d). Content: %.*s\n", + __func__, declen, enclen, encsym); + kfree(sym); + return -EIO; + } + sym[declen + 1] = '\0'; + *decsym = sym; + return declen; +} +#else +static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym) +{ + return -EOPNOTSUPP; +} +#endif + /* * Populate an inode based on info from mds. May be called on new or * existing inodes. @@ -856,15 +1014,20 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* directories have fl_stripe_unit set to zero */ - if (le32_to_cpu(info->layout.fl_stripe_unit)) - inode->i_blkbits = - fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - else - inode->i_blkbits = CEPH_BLOCK_SHIFT; - __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); +#ifdef CONFIG_FS_ENCRYPTION + if (iinfo->fscrypt_auth_len && + ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) { + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; + ci->fscrypt_auth = iinfo->fscrypt_auth; + iinfo->fscrypt_auth = NULL; + iinfo->fscrypt_auth_len = 0; + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + } +#endif + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = mode; @@ -877,6 +1040,15 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); } + /* directories have fl_stripe_unit set to zero */ + if (IS_ENCRYPTED(inode)) + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + else if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); @@ -898,6 +1070,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + u64 size = le64_to_cpu(info->size); s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; @@ -911,10 +1084,22 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, pool_ns = old_ns; + if (IS_ENCRYPTED(inode) && size && + iinfo->fscrypt_file_len == sizeof(__le64)) { + u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); + + if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { + size = fsize; + } else { + pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", + info->size, size); + } + } + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); + size); /* only update max_size on auth cap */ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && ci->i_max_size != le64_to_cpu(info->max_size)) { @@ -974,26 +1159,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, inode->i_fop = &ceph_file_fops; break; case S_IFLNK: - inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { u32 symlen = iinfo->symlink_len; char *sym; spin_unlock(&ci->i_ceph_lock); - if (symlen != i_size_read(inode)) { - pr_err("%s %llx.%llx BAD symlink " - "size %lld\n", __func__, - ceph_vinop(inode), - i_size_read(inode)); + if (IS_ENCRYPTED(inode)) { + if (symlen != i_size_read(inode)) + pr_err("%s %llx.%llx BAD symlink size %lld\n", + __func__, ceph_vinop(inode), + i_size_read(inode)); + + err = decode_encrypted_symlink(iinfo->symlink, + symlen, (u8 **)&sym); + if (err < 0) { + pr_err("%s decoding encrypted symlink failed: %d\n", + __func__, err); + goto out; + } + symlen = err; i_size_write(inode, symlen); inode->i_blocks = calc_inode_blocks(symlen); - } + } else { + if (symlen != i_size_read(inode)) { + pr_err("%s %llx.%llx BAD symlink size %lld\n", + __func__, ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } - err = -ENOMEM; - sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); - if (!sym) - goto out; + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + } spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) @@ -1001,7 +1202,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, else kfree(sym); /* lost a race */ } - inode->i_link = ci->i_symlink; + + if (IS_ENCRYPTED(inode)) { + /* + * Encrypted symlinks need to be decrypted before we can + * cache their targets in i_link. Don't touch it here. + */ + inode->i_op = &ceph_encrypted_symlink_iops; + } else { + inode->i_link = ci->i_symlink; + inode->i_op = &ceph_symlink_iops; + } break; case S_IFDIR: inode->i_op = &ceph_dir_iops; @@ -1309,8 +1520,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); @@ -1318,8 +1536,20 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) parent = d_find_any_alias(dir); BUG_ON(!parent); - dname.name = rinfo->dname; - dname.len = rinfo->dname_len; + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) { + dput(parent); + goto done; + } + + err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); + if (err < 0) { + dput(parent); + ceph_fname_free_buffer(dir, &oname); + goto done; + } + dname.name = oname.name; + dname.len = oname.len; dname.hash = full_name_hash(parent, dname.name, dname.len); tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); @@ -1334,9 +1564,15 @@ retry_lookup: dname.len, dname.name, dn); if (!dn) { dput(parent); + ceph_fname_free_buffer(dir, &oname); err = -ENOMEM; goto done; } + if (is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } err = 0; } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || @@ -1348,6 +1584,7 @@ retry_lookup: dput(dn); goto retry_lookup; } + ceph_fname_free_buffer(dir, &oname); req->r_dentry = dn; dput(parent); @@ -1551,7 +1788,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); - in = ceph_get_inode(req->r_dentry->d_sb, vino); + in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); dout("new_inode badness got %d\n", err); @@ -1629,7 +1866,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + struct inode *inode = d_inode(parent); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1703,9 +1941,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { - u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - rde->name, rde->name_len); - hash = ceph_frag_value(hash); + u32 hash = ceph_frag_value(rde->raw_hash); if (hash != last_hash) fpos_offset = 2; last_hash = hash; @@ -1728,6 +1964,11 @@ retry_lookup: err = -ENOMEM; goto out; } + if (rde->is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { @@ -1753,7 +1994,7 @@ retry_lookup: if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, tvino); + in = ceph_get_inode(parent->d_sb, tvino, NULL); if (IS_ERR(in)) { dout("new_inode badness\n"); d_drop(dn); @@ -1926,7 +2167,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { - dout("__do_pending_vmtruncate %p none pending\n", inode); + dout("%s %p none pending\n", __func__, inode); spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); return; @@ -1938,8 +2179,7 @@ retry: */ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { spin_unlock(&ci->i_ceph_lock); - dout("__do_pending_vmtruncate %p flushing snaps first\n", - inode); + dout("%s %p flushing snaps first\n", __func__, inode); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -1948,9 +2188,9 @@ retry: /* there should be no reader or writer */ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); - to = ci->i_truncate_size; + to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; - dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + dout("%s %p (%d) to %lld\n", __func__, inode, ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); @@ -1958,7 +2198,7 @@ retry: truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - if (to == ci->i_truncate_size) { + if (to == ci->i_truncate_pagecache_size) { ci->i_truncate_pending = 0; finish = 1; } @@ -1999,6 +2239,32 @@ static void ceph_inode_work(struct work_struct *work) iput(inode); } +static const char *ceph_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!dentry) + return ERR_PTR(-ECHILD); + + return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode), + done); +} + +static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int ret; + + ret = ceph_getattr(idmap, path, stat, request_mask, query_flags); + if (ret) + return ret; + return fscrypt_symlink_getattr(path, stat); +} + /* * symlinks */ @@ -2009,20 +2275,173 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, }; -int __ceph_setattr(struct inode *inode, struct iattr *attr) +static const struct inode_operations ceph_encrypted_symlink_iops = { + .get_link = ceph_encrypted_get_link, + .setattr = ceph_setattr, + .getattr = ceph_encrypted_symlink_getattr, + .listxattr = ceph_listxattr, +}; + +/* + * Transfer the encrypted last block to the MDS and the MDS + * will help update it when truncating a smaller size. + * + * We don't support a PAGE_SIZE that is smaller than the + * CEPH_FSCRYPT_BLOCK_SIZE. + */ +static int fill_fscrypt_truncate(struct inode *inode, + struct ceph_mds_request *req, + struct iattr *attr) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; + loff_t pos, orig_pos = round_down(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE); + u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; + struct ceph_pagelist *pagelist = NULL; + struct kvec iov = {0}; + struct iov_iter iter; + struct page *page = NULL; + struct ceph_fscrypt_truncate_size_header header; + int retry_op = 0; + int len = CEPH_FSCRYPT_BLOCK_SIZE; + loff_t i_size = i_size_read(inode); + int got, ret, issued; + u64 objver; + + ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); + if (ret < 0) + return ret; + + issued = __ceph_caps_issued(ci, NULL); + + dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__, + i_size, attr->ia_size, ceph_cap_string(got), + ceph_cap_string(issued)); + + /* Try to writeback the dirty pagecaches */ + if (issued & (CEPH_CAP_FILE_BUFFER)) { + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + + ret = filemap_write_and_wait_range(inode->i_mapping, + orig_pos, lend); + if (ret < 0) + goto out; + } + + page = __page_cache_alloc(GFP_KERNEL); + if (page == NULL) { + ret = -ENOMEM; + goto out; + } + + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) { + ret = -ENOMEM; + goto out; + } + + iov.iov_base = kmap_local_page(page); + iov.iov_len = len; + iov_iter_kvec(&iter, READ, &iov, 1, len); + + pos = orig_pos; + ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); + if (ret < 0) + goto out; + + /* Insert the header first */ + header.ver = 1; + header.compat = 1; + header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); + + /* + * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, + * because in MDS it may need this to do the truncate. + */ + header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); + + /* + * If we hit a hole here, we should just skip filling + * the fscrypt for the request, because once the fscrypt + * is enabled, the file will be split into many blocks + * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there + * has a hole, the hole size should be multiple of block + * size. + * + * If the Rados object doesn't exist, it will be set to 0. + */ + if (!objver) { + dout("%s hit hole, ppos %lld < size %lld\n", __func__, + pos, i_size); + + header.data_len = cpu_to_le32(8 + 8 + 4); + header.file_offset = 0; + ret = 0; + } else { + header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); + header.file_offset = cpu_to_le64(orig_pos); + + dout("%s encrypt block boff/bsize %d/%lu\n", __func__, + boff, CEPH_FSCRYPT_BLOCK_SIZE); + + /* truncate and zero out the extra contents for the last block */ + memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); + + /* encrypt the last block */ + ret = ceph_fscrypt_encrypt_block_inplace(inode, page, + CEPH_FSCRYPT_BLOCK_SIZE, + 0, block, + GFP_KERNEL); + if (ret) + goto out; + } + + /* Insert the header */ + ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); + if (ret) + goto out; + + if (header.block_size) { + /* Append the last block contents to pagelist */ + ret = ceph_pagelist_append(pagelist, iov.iov_base, + CEPH_FSCRYPT_BLOCK_SIZE); + if (ret) + goto out; + } + req->r_pagelist = pagelist; +out: + dout("%s %p size dropping cap refs on %s\n", __func__, + inode, ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (iov.iov_base) + kunmap_local(iov.iov_base); + if (page) + __free_pages(page, 0); + if (ret && pagelist) + ceph_pagelist_release(pagelist); + return ret; +} + +int __ceph_setattr(struct inode *inode, struct iattr *attr, + struct ceph_iattr *cia) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; + loff_t isize = i_size_read(inode); int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + bool fill_fscrypt; + int truncate_retry = 20; /* The RMW will take around 50ms */ +retry: prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2034,6 +2453,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) return PTR_ERR(req); } + fill_fscrypt = false; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); @@ -2049,6 +2469,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (cia && cia->fscrypt_auth) { + u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth); + + if (len > sizeof(*cia->fscrypt_auth)) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } + + dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n", + ceph_vinop(inode), ci->fscrypt_auth_len, len); + + /* It should never be re-set once set */ + WARN_ON_ONCE(ci->fscrypt_auth); + + if (issued & CEPH_CAP_AUTH_EXCL) { + dirtied |= CEPH_CAP_AUTH_EXCL; + kfree(ci->fscrypt_auth); + ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; + ci->fscrypt_auth_len = len; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + ci->fscrypt_auth_len != len || + memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) { + req->r_fscrypt_auth = cia->fscrypt_auth; + mask |= CEPH_SETATTR_FSCRYPT_AUTH; + release |= CEPH_CAP_AUTH_SHARED; + } + cia->fscrypt_auth = NULL; + } +#else + if (cia && cia->fscrypt_auth) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } +#endif /* CONFIG_FS_ENCRYPTION */ if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, @@ -2118,10 +2575,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - loff_t isize = i_size_read(inode); - dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + /* + * Only when the new size is smaller and not aligned to + * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed. + */ + if (IS_ENCRYPTED(inode) && attr->ia_size < isize && + (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) { + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + fill_fscrypt = true; + } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { if (attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); @@ -2131,11 +2605,24 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || attr->ia_size != isize) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + if (IS_ENCRYPTED(inode) && attr->ia_size) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + } else { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); + req->r_fscrypt_file = 0; + } } } if (ia_valid & ATTR_MTIME) { @@ -2166,7 +2653,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + inode_get_ctime(inode).tv_sec, + inode_get_ctime(inode).tv_nsec, attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); if (only) { @@ -2191,14 +2679,16 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); inode_inc_iversion_raw(inode); } release &= issued; spin_unlock(&ci->i_ceph_lock); - if (lock_snap_rwsem) + if (lock_snap_rwsem) { up_read(&mdsc->snap_rwsem); + lock_snap_rwsem = false; + } if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -2210,8 +2700,29 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; req->r_stamp = attr->ia_ctime; + if (fill_fscrypt) { + err = fill_fscrypt_truncate(inode, req, attr); + if (err) + goto out; + } + + /* + * The truncate request will return -EAGAIN when the + * last block has been updated just before the MDS + * successfully gets the xlock for the FILE lock. To + * avoid corrupting the file contents we need to retry + * it. + */ err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == -EAGAIN && truncate_retry--) { + dout("setattr %p result=%d (%s locally, %d remote), retry it!\n", + inode, err, ceph_cap_string(dirtied), mask); + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + goto retry; + } } +out: dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); @@ -2240,6 +2751,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err != 0) return err; @@ -2252,7 +2767,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) return -EDQUOT; - err = __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr, NULL); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); @@ -2465,7 +2980,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, return err; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = ceph_present_inode(inode); /* @@ -2523,8 +3038,12 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } - stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC | + STATX_ATTR_ENCRYPTED); + stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index deac817647eb..91a84917d203 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -6,6 +6,7 @@ #include "mds_client.h" #include "ioctl.h" #include <linux/ceph/striper.h> +#include <linux/fscrypt.h> /* * ioctls @@ -268,9 +269,96 @@ static long ceph_ioctl_syncio(struct file *file) return 0; } +static int vet_mds_for_fscrypt(struct file *file) +{ + int i, ret = -EOPNOTSUPP; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = mdsc->sessions[i]; + + if (!s) + continue; + if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features)) + ret = 0; + break; + } + mutex_unlock(&mdsc->mutex); + return ret; +} + +static long ceph_set_encryption_policy(struct file *file, unsigned long arg) +{ + int ret, got = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + + /* encrypted directories can't have striped layout */ + if (ci->i_layout.stripe_count > 1) + return -EINVAL; + + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + + /* + * Ensure we hold these caps so that we _know_ that the rstats check + * in the empty_dir check is reliable. + */ + ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got); + if (ret) + return ret; + + ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg); + if (got) + ceph_put_cap_refs(ci, got); + + return ret; +} + +static const char *ceph_ioctl_cmd_name(const unsigned int cmd) +{ + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return "get_layout"; + case CEPH_IOC_SET_LAYOUT: + return "set_layout"; + case CEPH_IOC_SET_LAYOUT_POLICY: + return "set_layout_policy"; + case CEPH_IOC_GET_DATALOC: + return "get_dataloc"; + case CEPH_IOC_LAZYIO: + return "lazyio"; + case CEPH_IOC_SYNCIO: + return "syncio"; + case FS_IOC_SET_ENCRYPTION_POLICY: + return "set_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY: + return "get_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return "get_encryption_policy_ex"; + case FS_IOC_ADD_ENCRYPTION_KEY: + return "add_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return "remove_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return "remove_encryption_key_all_users"; + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return "get_encryption_key_status"; + case FS_IOC_GET_ENCRYPTION_NONCE: + return "get_encryption_nonce"; + default: + return "unknown"; + } +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + int ret; + + dout("ioctl file %p cmd %s arg %lu\n", file, + ceph_ioctl_cmd_name(cmd), arg); switch (cmd) { case CEPH_IOC_GET_LAYOUT: return ceph_ioctl_get_layout(file, (void __user *)arg); @@ -289,6 +377,43 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SYNCIO: return ceph_ioctl_syncio(file); + + case FS_IOC_SET_ENCRYPTION_POLICY: + return ceph_set_encryption_policy(file, arg); + + case FS_IOC_GET_ENCRYPTION_POLICY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_add_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return fscrypt_ioctl_remove_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return fscrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_NONCE: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_nonce(file, (void __user *)arg); } return -ENOTTY; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5fb367b1d4b0..615db141b6c4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> @@ -184,8 +185,54 @@ static int parse_reply_info_in(void **p, void *end, info->rsnaps = 0; } + if (struct_v >= 5) { + u32 alen; + + ceph_decode_32_safe(p, end, alen, bad); + + while (alen--) { + u32 len; + + /* key */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + /* value */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + } + } + + /* fscrypt flag -- ignore */ + if (struct_v >= 6) + ceph_decode_skip_8(p, end, bad); + + info->fscrypt_auth = NULL; + info->fscrypt_auth_len = 0; + info->fscrypt_file = NULL; + info->fscrypt_file_len = 0; + if (struct_v >= 7) { + ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); + if (info->fscrypt_auth_len) { + info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, + GFP_KERNEL); + if (!info->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_auth, + info->fscrypt_auth_len, bad); + } + ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); + if (info->fscrypt_file_len) { + info->fscrypt_file = kmalloc(info->fscrypt_file_len, + GFP_KERNEL); + if (!info->fscrypt_file) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_file, + info->fscrypt_file_len, bad); + } + } *p = end; } else { + /* legacy (unversioned) struct */ if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); @@ -263,27 +310,47 @@ bad: static int parse_reply_info_lease(void **p, void *end, struct ceph_mds_reply_lease **lease, - u64 features) + u64 features, u32 *altname_len, u8 **altname) { + u8 struct_v; + u32 struct_len; + void *lend; + if (features == (u64)-1) { - u8 struct_v, struct_compat; - u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); - ceph_decode_need(p, end, struct_len, bad); - end = *p + struct_len; + } else { + struct_len = sizeof(**lease); + *altname_len = 0; + *altname = NULL; } - ceph_decode_need(p, end, sizeof(**lease), bad); + lend = *p + struct_len; + ceph_decode_need(p, end, struct_len, bad); *lease = *p; *p += sizeof(**lease); - if (features == (u64)-1) - *p = end; + + if (features == (u64)-1) { + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, *altname_len, bad); + ceph_decode_need(p, end, *altname_len, bad); + *altname = *p; + *p += *altname_len; + } else { + *altname = NULL; + *altname_len = 0; + } + } + *p = lend; return 0; bad: return -EIO; @@ -313,7 +380,8 @@ static int parse_reply_info_trace(void **p, void *end, info->dname = *p; *p += info->dname_len; - err = parse_reply_info_lease(p, end, &info->dlease, features); + err = parse_reply_info_lease(p, end, &info->dlease, features, + &info->altname_len, &info->altname); if (err < 0) goto out_bad; } @@ -339,9 +407,10 @@ out_bad: * parse readdir results */ static int parse_reply_info_readdir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, - u64 features) + struct ceph_mds_request *req, + u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 num, i = 0; int err; @@ -371,18 +440,87 @@ static int parse_reply_info_readdir(void **p, void *end, info->dir_nr = num; while (num) { + struct inode *inode = d_inode(req->r_dentry); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + struct fscrypt_str tname = FSTR_INIT(NULL, 0); + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname; + u32 altname_len, _name_len; + u8 *altname, *_name; + /* dentry */ - ceph_decode_32_safe(p, end, rde->name_len, bad); - ceph_decode_need(p, end, rde->name_len, bad); - rde->name = *p; - *p += rde->name_len; - dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + ceph_decode_32_safe(p, end, _name_len, bad); + ceph_decode_need(p, end, _name_len, bad); + _name = *p; + *p += _name_len; + dout("parsed dir dname '%.*s'\n", _name_len, _name); + + if (info->hash_order) + rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + _name, _name_len); /* dentry lease */ - err = parse_reply_info_lease(p, end, &rde->lease, features); + err = parse_reply_info_lease(p, end, &rde->lease, features, + &altname_len, &altname); if (err) goto out_bad; + + /* + * Try to dencrypt the dentry names and update them + * in the ceph_mds_reply_dir_entry struct. + */ + fname.dir = inode; + fname.name = _name; + fname.name_len = _name_len; + fname.ctext = altname; + fname.ctext_len = altname_len; + /* + * The _name_len maybe larger than altname_len, such as + * when the human readable name length is in range of + * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE), + * then the copy in ceph_fname_to_usr will corrupt the + * data if there has no encryption key. + * + * Just set the no_copy flag and then if there has no + * encryption key the oname.name will be assigned to + * _name always. + */ + fname.no_copy = true; + if (altname_len == 0) { + /* + * Set tname to _name, and this will be used + * to do the base64_decode in-place. It's + * safe because the decoded string should + * always be shorter, which is 3/4 of origin + * string. + */ + tname.name = _name; + + /* + * Set oname to _name too, and this will be + * used to do the dencryption in-place. + */ + oname.name = _name; + oname.len = _name_len; + } else { + /* + * This will do the decryption only in-place + * from altname cryptext directly. + */ + oname.name = altname; + oname.len = altname_len; + } + rde->is_nokey = false; + err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey); + if (err) { + pr_err("%s unable to decode %.*s, got %d\n", __func__, + _name_len, _name, err); + goto out_bad; + } + rde->name = oname.name; + rde->name_len = oname.len; + /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) @@ -581,15 +719,16 @@ bad: * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, + struct ceph_mds_request *req, u64 features, struct ceph_mds_session *s) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 op = le32_to_cpu(info->head->op); if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_readdir(p, end, info, features); + return parse_reply_info_readdir(p, end, req, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); else if (op == CEPH_MDS_OP_GETVXATTR) @@ -602,9 +741,9 @@ static int parse_reply_info_extra(void **p, void *end, * parse entire mds reply */ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info, - u64 features) + struct ceph_mds_request *req, u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; void *p, *end; u32 len; int err; @@ -626,7 +765,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_extra(&p, p+len, info, features, s); + err = parse_reply_info_extra(&p, p+len, req, features, s); if (err < 0) goto out_bad; } @@ -651,8 +790,21 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { + int i; + + kfree(info->diri.fscrypt_auth); + kfree(info->diri.fscrypt_file); + kfree(info->targeti.fscrypt_auth); + kfree(info->targeti.fscrypt_file); if (!info->dir_entries) return; + + for (i = 0; i < info->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + + kfree(rde->inode.fscrypt_auth); + kfree(rde->inode.fscrypt_file); + } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -945,6 +1097,7 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_parent); } iput(req->r_target_inode); + iput(req->r_new_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -965,6 +1118,8 @@ void ceph_mdsc_release_request(struct kref *kref) put_cred(req->r_cred); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); + kfree(req->r_fscrypt_auth); + kfree(req->r_altname); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); @@ -2373,20 +2528,90 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) return mdsc->oldest_tid; } -/* - * Build a dentry's path. Allocate on heap; caller must kfree. Based - * on build_path_from_dentry in fs/cifs/dir.c. +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + struct inode *dir = req->r_parent; + struct dentry *dentry = req->r_dentry; + u8 *cryptbuf = NULL; + u32 len = 0; + int ret = 0; + + /* only encode if we have parent and dentry */ + if (!dir || !dentry) + goto success; + + /* No-op unless this is encrypted */ + if (!IS_ENCRYPTED(dir)) + goto success; + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret < 0) + return ERR_PTR(ret); + + /* No key? Just ignore it. */ + if (!fscrypt_has_encryption_key(dir)) + goto success; + + if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, + &len)) { + WARN_ON_ONCE(1); + return ERR_PTR(-ENAMETOOLONG); + } + + /* No need to append altname if name is short enough */ + if (len <= CEPH_NOHASH_NAME_MAX) { + len = 0; + goto success; + } + + cryptbuf = kmalloc(len, GFP_KERNEL); + if (!cryptbuf) + return ERR_PTR(-ENOMEM); + + ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + if (ret) { + kfree(cryptbuf); + return ERR_PTR(ret); + } +success: + *plen = len; + return cryptbuf; +} +#else +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + *plen = 0; + return NULL; +} +#endif + +/** + * ceph_mdsc_build_path - build a path string to a given dentry + * @dentry: dentry to which path should be built + * @plen: returned length of string + * @pbase: returned base inode number + * @for_wire: is this path going to be sent to the MDS? + * + * Build a string that represents the path to the dentry. This is mostly called + * for two different purposes: + * + * 1) we need to build a path string to send to the MDS (for_wire == true) + * 2) we need a path string for local presentation (e.g. debugfs) + * (for_wire == false) * - * If @stop_on_nosnap, generate path relative to the first non-snapped - * inode. + * The path is built in reverse, starting with the dentry. Walk back up toward + * the root, building the path until the first non-snapped inode is reached + * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, - int stop_on_nosnap) + int for_wire) { - struct dentry *temp; + struct dentry *cur; + struct inode *inode; char *path; int pos; unsigned seq; @@ -2403,34 +2628,72 @@ retry: path[pos] = '\0'; seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - temp = dentry; + cur = dget(dentry); for (;;) { - struct inode *inode; + struct dentry *parent; - spin_lock(&temp->d_lock); - inode = d_inode(temp); + spin_lock(&cur->d_lock); + inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", - pos, temp); - } else if (stop_on_nosnap && inode && dentry != temp && + pos, cur); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { - spin_unlock(&temp->d_lock); + spin_unlock(&cur->d_lock); pos++; /* get rid of any prepended '/' */ break; + } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { + pos -= cur->d_name.len; + if (pos < 0) { + spin_unlock(&cur->d_lock); + break; + } + memcpy(path + pos, cur->d_name.name, cur->d_name.len); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); } else { - pos -= temp->d_name.len; + int len, ret; + char buf[NAME_MAX]; + + /* + * Proactively copy name into buf, in case we need to + * present it as-is. + */ + memcpy(buf, cur->d_name.name, cur->d_name.len); + len = cur->d_name.len; + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + + ret = ceph_fscrypt_prepare_readdir(d_inode(parent)); + if (ret < 0) { + dput(parent); + dput(cur); + return ERR_PTR(ret); + } + + if (fscrypt_has_encryption_key(d_inode(parent))) { + len = ceph_encode_encrypted_fname(d_inode(parent), + cur, buf); + if (len < 0) { + dput(parent); + dput(cur); + return ERR_PTR(len); + } + } + pos -= len; if (pos < 0) { - spin_unlock(&temp->d_lock); + dput(parent); break; } - memcpy(path + pos, temp->d_name.name, temp->d_name.len); + memcpy(path + pos, buf, len); } - spin_unlock(&temp->d_lock); - temp = READ_ONCE(temp->d_parent); + dput(cur); + cur = parent; /* Are we at the root? */ - if (IS_ROOT(temp)) + if (IS_ROOT(cur)) break; /* Are we out of buffer? */ @@ -2439,8 +2702,9 @@ retry: path[pos] = '/'; } - base = ceph_ino(d_inode(temp)); - rcu_read_unlock(); + inode = d_inode(cur); + base = inode ? ceph_ino(inode) : 0; + dput(cur); if (read_seqretry(&rename_lock, seq)) goto retry; @@ -2450,8 +2714,8 @@ retry: * A rename didn't occur, but somehow we didn't end up where * we thought we would. Throw a warning and try again. */ - pr_warn("build_path did not end path lookup where " - "expected, pos is %d\n", pos); + pr_warn("build_path did not end path lookup where expected (pos = %d)\n", + pos); goto retry; } @@ -2471,7 +2735,8 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); - if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && + !IS_ENCRYPTED(dir)) { *pino = ceph_ino(dir); rcu_read_unlock(); *ppath = dentry->d_name.name; @@ -2539,8 +2804,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, return r; } -static void encode_timestamp_and_gids(void **p, - const struct ceph_mds_request *req) +static void encode_mclientrequest_tail(void **p, + const struct ceph_mds_request *req) { struct ceph_timespec ts; int i; @@ -2548,11 +2813,43 @@ static void encode_timestamp_and_gids(void **p, ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(p, &ts, sizeof(ts)); - /* gid_list */ + /* v4: gid_list */ ceph_encode_32(p, req->r_cred->group_info->ngroups); for (i = 0; i < req->r_cred->group_info->ngroups; i++) ceph_encode_64(p, from_kgid(&init_user_ns, req->r_cred->group_info->gid[i])); + + /* v5: altname */ + ceph_encode_32(p, req->r_altname_len); + ceph_encode_copy(p, req->r_altname, req->r_altname_len); + + /* v6: fscrypt_auth and fscrypt_file */ + if (req->r_fscrypt_auth) { + u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + ceph_encode_32(p, authlen); + ceph_encode_copy(p, req->r_fscrypt_auth, authlen); + } else { + ceph_encode_32(p, 0); + } + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { + ceph_encode_32(p, sizeof(__le64)); + ceph_encode_64(p, req->r_fscrypt_file); + } else { + ceph_encode_32(p, 0); + } +} + +static struct ceph_mds_request_head_legacy * +find_legacy_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head_old *ohead; + + if (legacy) + return (struct ceph_mds_request_head_legacy *)p; + ohead = (struct ceph_mds_request_head_old *)p; + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; } /* @@ -2565,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head_old *head; + struct ceph_mds_request_head_legacy *lhead; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; @@ -2577,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, void *p, *end; int ret; bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, + &session->s_features); ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2601,12 +2900,32 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, goto out_free1; } - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); - len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + - sizeof(struct ceph_timespec); - len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + req->r_altname = get_fscrypt_altname(req, &req->r_altname_len); + if (IS_ERR(req->r_altname)) { + msg = ERR_CAST(req->r_altname); + req->r_altname = NULL; + goto out_free2; + } + + /* + * For old cephs without supporting the 32bit retry/fwd feature + * it will copy the raw memories directly when decoding the + * requests. While new cephs will decode the head depending the + * version member, so we need to make sure it will be compatible + * with them both. + */ + if (legacy) + len = sizeof(struct ceph_mds_request_head_legacy); + else if (old_version) + len = sizeof(struct ceph_mds_request_head_old); + else + len = sizeof(struct ceph_mds_request_head); - /* calculate (max) length for cap releases */ + /* filepaths */ + len += 2 * (1 + sizeof(u32) + sizeof(u64)); + len += pathlen1 + pathlen2; + + /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); @@ -2616,6 +2935,27 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (req->r_old_dentry_drop) len += pathlen2; + /* MClientRequest tail */ + + /* req->r_stamp */ + len += sizeof(struct ceph_timespec); + + /* gid list */ + len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + + /* alternate name */ + len += sizeof(u32) + req->r_altname_len; + + /* fscrypt_auth */ + len += sizeof(u32); // fscrypt_auth + if (req->r_fscrypt_auth) + len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + /* fscrypt_file */ + len += sizeof(u32); + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) + len += sizeof(__le64); + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); @@ -2624,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.tid = cpu_to_le64(req->r_tid); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + /* - * The old ceph_mds_request_head didn't contain a version field, and + * The ceph_mds_request_head_legacy didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { msg->hdr.version = cpu_to_le16(3); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); - } else { - struct ceph_mds_request_head *new_head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*lhead); + } else if (old_version) { + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; - p = msg->front.iov_base + sizeof(*new_head); + ohead->version = cpu_to_le16(1); + p = msg->front.iov_base + sizeof(*ohead); + } else { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(6); + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + p = msg->front.iov_base + sizeof(*nhead); } end = msg->front.iov_base + msg->front.iov_len; - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); - head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, - req->r_cred->fsuid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, - req->r_cred->fsgid)); - head->ino = cpu_to_le64(req->r_deleg_ino); - head->args = req->r_args; + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + lhead->op = cpu_to_le32(req->r_op); + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + req->r_cred->fsuid)); + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + req->r_cred->fsgid)); + lhead->ino = cpu_to_le64(req->r_deleg_ino); + lhead->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); @@ -2665,15 +3012,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, req->r_inode ? req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, req->r_op == CEPH_MDS_OP_READDIR); - if (req->r_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_dentry, + if (req->r_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, req->r_dentry_unless); - if (req->r_old_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + if (ret < 0) + goto out_err; + releases += ret; + } + if (req->r_old_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_old_dentry, req->r_old_dentry_dir, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (ret < 0) + goto out_err; + releases += ret; + } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2684,9 +3039,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, p = msg->front.iov_base + req->r_request_release_offset; } - head->num_releases = cpu_to_le16(releases); + lhead->num_releases = cpu_to_le16(releases); - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); @@ -2715,6 +3070,10 @@ out_free1: ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; +out_err: + ceph_msg_put(msg); + msg = ERR_PTR(ret); + goto out_free2; } /* @@ -2731,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } -static struct ceph_mds_request_head_old * -find_old_request_head(void *p, u64 features) -{ - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head *new_head; - - if (legacy) - return (struct ceph_mds_request_head_old *)p; - new_head = (struct ceph_mds_request_head *)p; - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; -} - /* * called under mdsc->mutex */ @@ -2752,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session, { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; - struct ceph_mds_request_head_old *rhead; + struct ceph_mds_request_head_legacy *lhead; + struct ceph_mds_request_head *nhead; struct ceph_msg *msg; - int flags = 0, max_retry; + int flags = 0, old_max_retry; + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, + &session->s_features); /* - * The type of 'r_attempts' in kernel 'ceph_mds_request' - * is 'int', while in 'ceph_mds_request_head' the type of - * 'num_retry' is '__u8'. So in case the request retries - * exceeding 256 times, the MDS will receive a incorrect - * retry seq. - * - * In this case it's ususally a bug in MDS and continue - * retrying the request makes no sense. - * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * Avoid inifinite retrying after overflow. The client will + * increase the retry count and if the MDS is old version, + * so we limit to retry at most 256 times. */ - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); - max_retry = 1 << (max_retry * BITS_PER_BYTE); - if (req->r_attempts >= max_retry) { - pr_warn_ratelimited("%s request tid %llu seq overflow\n", - __func__, req->r_tid); - return -EMULTIHOP; + if (req->r_attempts) { + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, + num_retry); + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); + if ((old_version && req->r_attempts >= old_max_retry) || + ((uint32_t)req->r_attempts >= U32_MAX)) { + pr_warn_ratelimited("%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } } req->r_attempts++; @@ -2800,23 +3146,27 @@ static int __prepare_send_request(struct ceph_mds_session *session, * d_move mangles the src name. */ msg = req->r_request; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); - flags = le32_to_cpu(rhead->flags); + flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; - rhead->flags = cpu_to_le32(flags); + lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - rhead->num_retry = req->r_attempts - 1; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } /* remove cap/dentry releases from message */ - rhead->num_releases = 0; + lhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2834,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, } req->r_request = msg; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; - rhead->flags = cpu_to_le32(flags); - rhead->num_fwd = req->r_num_fwd; - rhead->num_retry = req->r_attempts - 1; + lhead->flags = cpu_to_le32(flags); + lhead->num_fwd = req->r_num_fwd; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } dout(" r_parent = %p\n", req->r_parent); return 0; @@ -3348,22 +3703,35 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } dout("handle_reply tid %lld result %d\n", tid, result); - rinfo = &req->r_reply_info; if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) - err = parse_reply_info(session, msg, rinfo, (u64)-1); + err = parse_reply_info(session, msg, req, (u64)-1); else - err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); + err = parse_reply_info(session, msg, req, + session->s_con.peer_features); mutex_unlock(&mdsc->mutex); /* Must find target inode outside of mutexes to avoid deadlocks */ + rinfo = &req->r_reply_info; if ((err >= 0) && rinfo->head->is_target) { - struct inode *in; + struct inode *in = xchg(&req->r_new_inode, NULL); struct ceph_vino tvino = { .ino = le64_to_cpu(rinfo->targeti.in->ino), .snap = le64_to_cpu(rinfo->targeti.in->snapid) }; - in = ceph_get_inode(mdsc->fsc->sb, tvino); + /* + * If we ended up opening an existing inode, discard + * r_new_inode + */ + if (req->r_op == CEPH_MDS_OP_CREATE && + !req->r_reply_info.has_create_ino) { + /* This should never happen on an async create */ + WARN_ON_ONCE(req->r_deleg_ino); + iput(in); + in = NULL; + } + + in = ceph_get_inode(mdsc->fsc->sb, tvino, in); if (IS_ERR(in)) { err = PTR_ERR(in); mutex_lock(&session->s_mutex); @@ -3406,7 +3774,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) - ceph_readdir_prepopulate(req, req->r_session); + err = ceph_readdir_prepopulate(req, req->r_session); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); @@ -3491,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); - } else if (fwd_seq <= req->r_num_fwd) { + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* - * The type of 'num_fwd' in ceph 'MClientRequestForward' - * is 'int32_t', while in 'ceph_mds_request_head' the - * type is '__u8'. So in case the request bounces between - * MDSes exceeding 256 times, the client will get stuck. - * - * In this case it's ususally a bug in MDS and continue - * bouncing the request makes no sense. + * Avoid inifinite retrying after overflow. * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * The MDS will increase the fwd count and in client side + * if the num_fwd is less than the one saved in request + * that means the MDS is an old version and overflowed of + * 8 bits. */ - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); - max = 1 << (max * BITS_PER_BYTE); - if (req->r_num_fwd >= max) { - mutex_lock(&req->r_fill_mutex); - req->r_err = -EMULTIHOP; - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); - mutex_unlock(&req->r_fill_mutex); - aborted = true; - pr_warn_ratelimited("forward tid %llu seq overflow\n", - tid); - } else { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); - } + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited("forward tid %llu seq overflow\n", tid); } else { /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); @@ -4550,6 +4906,9 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease from mds%d\n", mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) goto bad; @@ -4568,8 +4927,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - inc_session_sequence(session); - if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); goto release; @@ -4631,9 +4988,13 @@ release: out: mutex_unlock(&session->s_mutex); iput(inode); + + ceph_dec_mds_stopping_blocker(mdsc); return; bad: + ceph_dec_mds_stopping_blocker(mdsc); + pr_err("corrupt lease message\n"); ceph_msg_dump(msg); } @@ -4829,6 +5190,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) } init_completion(&mdsc->safe_umount_waiters); + spin_lock_init(&mdsc->stopping_lock); + atomic_set(&mdsc->stopping_blockers, 0); + init_completion(&mdsc->stopping_waiter); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 86d2965e68a1..5a3714bdd64a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -32,8 +32,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_32BITS_RETRY_FWD, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,8 +45,10 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ + CEPHFS_FEATURE_ALTERNATE_NAME, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ CEPHFS_FEATURE_OP_GETVXATTR, \ + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ } /* @@ -86,13 +89,19 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u8 *fscrypt_auth; + u8 *fscrypt_file; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; }; struct ceph_mds_reply_dir_entry { + bool is_nokey; char *name; u32 name_len; + u32 raw_hash; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; loff_t offset; @@ -116,7 +125,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; + u8 *altname; u32 dname_len; + u32 altname_len; struct ceph_mds_reply_lease *dlease; struct ceph_mds_reply_xattr xattr_info; @@ -263,6 +274,7 @@ struct ceph_mds_request { struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ + struct inode *r_new_inode; /* new inode (for creates) */ #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ @@ -272,11 +284,19 @@ struct ceph_mds_request { #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ #define CEPH_MDS_R_ASYNC (8) /* async request */ +#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ unsigned long r_req_flags; struct mutex r_fill_mutex; union ceph_mds_request_args r_args; + + struct ceph_fscrypt_auth *r_fscrypt_auth; + u64 r_fscrypt_file; + + u8 *r_altname; /* fscrypt binary crypttext for long filenames */ + u32 r_altname_len; /* length of r_altname */ + int r_fmode; /* file mode, if expecting cap */ int r_request_release_offset; const struct cred *r_cred; @@ -381,8 +401,9 @@ struct cap_wait { }; enum { - CEPH_MDSC_STOPPING_BEGIN = 1, - CEPH_MDSC_STOPPING_FLUSHED = 2, + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHING = 2, + CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* @@ -401,7 +422,11 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ - int stopping; /* true if shutting down */ + + spinlock_t stopping_lock; /* protect snap_empty */ + int stopping; /* the stage of shutting down */ + atomic_t stopping_blockers; + struct completion stopping_waiter; atomic64_t quotarealms_count; /* # realms with quota */ /* @@ -557,7 +582,7 @@ static inline void ceph_mdsc_free_path(char *path, int len) } extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, - int stop_on_nosnap); + int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 64592adfe48f..f7fcf7f08ec6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -47,25 +47,23 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, struct inode *inode; struct ceph_inode_info *ci; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + if (msg->front.iov_len < sizeof(*h)) { pr_err("%s corrupt message mds%d len %d\n", __func__, session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg); - return; + goto out; } - /* increment msg sequence number */ - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - /* lookup inode */ vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { pr_warn("Failed to find inode %llu\n", vino.ino); - return; + goto out; } ci = ceph_inode(inode); @@ -78,6 +76,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); } static struct ceph_quotarealm_inode * diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 343d738448dc..813f21add992 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -660,7 +660,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size = i_size_read(inode); capsnap->mtime = inode->i_mtime; capsnap->atime = inode->i_atime; - capsnap->ctime = inode->i_ctime; + capsnap->ctime = inode_get_ctime(inode); capsnap->btime = ci->i_btime; capsnap->change_attr = inode_peek_iversion_raw(inode); capsnap->time_warp_seq = ci->i_time_warp_seq; @@ -1015,6 +1015,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, int locked_rwsem = 0; bool close_sessions = false; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -1030,10 +1033,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, mds, ceph_snap_op_name(op), split, trace_len); - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - down_write(&mdsc->snap_rwsem); locked_rwsem = 1; @@ -1151,6 +1150,7 @@ skip_inode: up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); + ceph_dec_mds_stopping_blocker(mdsc); return; bad: @@ -1160,6 +1160,8 @@ out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + ceph_dec_mds_stopping_blocker(mdsc); + if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a5f52013314d..2d7f5a8d4a92 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -20,6 +20,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> @@ -46,6 +47,7 @@ static void ceph_put_super(struct super_block *s) struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); + ceph_fscrypt_free_dummy_policy(fsc); ceph_mdsc_close_sessions(fsc->mdsc); } @@ -151,6 +153,7 @@ enum { Opt_recover_session, Opt_source, Opt_mon_addr, + Opt_test_dummy_encryption, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -165,6 +168,7 @@ enum { Opt_copyfrom, Opt_wsync, Opt_pagecache, + Opt_sparseread, }; enum ceph_recover_session_mode { @@ -192,6 +196,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("fsc", Opt_fscache), // fsc=... fsparam_flag_no ("ino32", Opt_ino32), fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_flag_no ("poolperm", Opt_poolperm), fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), @@ -203,10 +208,12 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), - fsparam_string ("mon_addr", Opt_mon_addr), + fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), fsparam_flag_no ("pagecache", Opt_pagecache), + fsparam_flag_no ("sparseread", Opt_sparseread), {} }; @@ -576,6 +583,29 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; break; + case Opt_sparseread: + if (result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD; + else + fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy); + ret = fscrypt_parse_test_dummy_encryption(param, + &fsopt->dummy_enc_policy); + if (ret == -EINVAL) { + warnfc(fc, "Value of option \"%s\" is unrecognized", + param->key); + } else if (ret == -EEXIST) { + warnfc(fc, "Conflicting test_dummy_encryption options"); + ret = -EINVAL; + } +#else + warnfc(fc, + "FS encryption not supported: test_dummy_encryption mount option ignored"); +#endif + break; default: BUG(); } @@ -596,6 +626,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->server_path); kfree(args->fscache_uniq); kfree(args->mon_addr); + fscrypt_free_dummy_policy(&args->dummy_enc_policy); kfree(args); } @@ -710,9 +741,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); - if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) seq_puts(m, ",nopagecache"); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + seq_puts(m, ",sparseread"); + + fscrypt_show_test_dummy_encryption(m, ',', root->d_sb); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); @@ -1052,6 +1086,50 @@ out: return root; } +#ifdef CONFIG_FS_ENCRYPTION +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + struct ceph_fs_client *fsc = sb->s_fs_info; + + if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy)) + return 0; + + /* No changing encryption context on remount. */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + /* Also make sure fsopt doesn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + + fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy; + memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy)); + + warnfc(fc, "test_dummy_encryption mode enabled"); + return 0; +} +#else +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + return 0; +} +#endif + /* * mount: join the ceph cluster, and open root directory. */ @@ -1080,6 +1158,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } + err = ceph_apply_test_dummy_encryption(fsc->sb, fc, + fsc->mount_options); + if (err) + goto out; + dout("mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); @@ -1101,6 +1184,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, out: mutex_unlock(&fsc->client->mount_mutex); + ceph_fscrypt_free_dummy_policy(fsc); return ERR_PTR(err); } @@ -1126,6 +1210,8 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_max = U32_MAX; s->s_flags |= SB_NODIRATIME | SB_NOATIME; + ceph_fscrypt_set_ops(s); + ret = set_anon_super_fc(s, fc); if (ret != 0) fsc->sb = NULL; @@ -1287,15 +1373,26 @@ static void ceph_free_fc(struct fs_context *fc) static int ceph_reconfigure_fc(struct fs_context *fc) { + int err; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; - struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + struct super_block *sb = fc->root->d_sb; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + err = ceph_apply_test_dummy_encryption(sb, fc, fsopt); + if (err) + return err; if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) ceph_set_mount_opt(fsc, ASYNC_DIROPS); else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + ceph_set_mount_opt(fsc, SPARSEREAD); + else + ceph_clear_mount_opt(fsc, SPARSEREAD); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { kfree(fsc->mount_options->mon_addr); fsc->mount_options->mon_addr = fsopt->mon_addr; @@ -1303,7 +1400,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc) pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); } - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); return 0; } @@ -1365,25 +1462,101 @@ nomem: return -ENOMEM; } +/* + * Return true if it successfully increases the blocker counter, + * or false if the mdsc is in stopping and flushed state. + */ +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { + spin_unlock(&mdsc->stopping_lock); + return false; + } + atomic_inc(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + return true; +} + +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (!atomic_dec_return(&mdsc->stopping_blockers) && + mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) + complete_all(&mdsc->stopping_waiter); + spin_unlock(&mdsc->stopping_lock); +} + +/* For metadata IO requests */ +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + +/* For data IO requests */ +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_mds_client *mdsc = fsc->mdsc; + bool wait; dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(fsc->mdsc); + ceph_mdsc_pre_umount(mdsc); flush_fs_workqueues(fsc); /* * Though the kill_anon_super() will finally trigger the - * sync_filesystem() anyway, we still need to do it here - * and then bump the stage of shutdown to stop the work - * queue as earlier as possible. + * sync_filesystem() anyway, we still need to do it here and + * then bump the stage of shutdown. This will allow us to + * drop any further message, which will increase the inodes' + * i_count reference counters but makes no sense any more, + * from MDSs. + * + * Without this when evicting the inodes it may fail in the + * kill_anon_super(), which will trigger a warning when + * destroying the fscrypt keyring and then possibly trigger + * a further crash in ceph module when the iput() tries to + * evict the inodes later. */ sync_filesystem(s); - fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + spin_lock(&mdsc->stopping_lock); + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; + wait = !!atomic_read(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + + if (wait && atomic_read(&mdsc->stopping_blockers)) { + long timeleft = wait_for_completion_killable_timeout( + &mdsc->stopping_waiter, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn("umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn("umount was killed, %ld\n", timeleft); + } + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3bfddf34d488..51c7f2b14f6f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -22,6 +22,7 @@ #include <linux/hashtable.h> #include <linux/ceph/libceph.h> +#include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ @@ -42,6 +43,7 @@ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ +#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ @@ -98,6 +100,7 @@ struct ceph_mount_options { char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ char *mon_addr; + struct fscrypt_dummy_policy dummy_enc_policy; }; /* mount state */ @@ -154,9 +157,11 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_volume *fscache; #endif +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_dummy_policy fsc_dummy_enc_policy; +#endif }; - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -419,6 +424,11 @@ struct ceph_inode_info { u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ + /* + * For none fscrypt case it equals to i_truncate_size or it will + * equals to fscrypt_file_size + */ + u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ @@ -449,6 +459,13 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; + +#ifdef CONFIG_FS_ENCRYPTION + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 *fscrypt_auth; + u8 *fscrypt_file; +#endif }; struct ceph_netfs_request_data { @@ -998,6 +1015,7 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) /* inode.c */ struct ceph_mds_reply_info_in; struct ceph_mds_reply_dirfrag; +struct ceph_acl_sec_ctx; extern const struct inode_operations ceph_file_iops; @@ -1005,8 +1023,14 @@ extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_evict_inode(struct inode *inode); extern void ceph_free_inode(struct inode *inode); +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx); + extern struct inode *ceph_get_inode(struct super_block *sb, - struct ceph_vino vino); + struct ceph_vino vino, + struct inode *newino); extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); @@ -1065,7 +1089,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) } extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -extern int __ceph_setattr(struct inode *inode, struct iattr *attr); + +struct ceph_iattr { + struct ceph_fscrypt_auth *fscrypt_auth; +}; + +extern int __ceph_setattr(struct inode *inode, struct iattr *attr, + struct ceph_iattr *cia); extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct mnt_idmap *idmap, @@ -1100,6 +1130,9 @@ struct ceph_acl_sec_ctx { void *sec_ctx; u32 sec_ctxlen; #endif +#ifdef CONFIG_FS_ENCRYPTION + struct ceph_fscrypt_auth *fscrypt_auth; +#endif struct ceph_pagelist *pagelist; }; @@ -1237,6 +1270,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); +extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, + int need, int want, loff_t endoff, int *got); extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, @@ -1272,6 +1307,9 @@ extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); +extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver); extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); @@ -1375,4 +1413,9 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc); +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 806183959c47..0deae4a0f5f1 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -352,6 +352,24 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, return ret; } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci) +{ + return ci->fscrypt_auth_len; +} + +static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci, + char *val, size_t size) +{ + if (size) { + if (size < ci->fscrypt_auth_len) + return -ERANGE; + memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len); + } + return ci->fscrypt_auth_len; +} +#endif /* CONFIG_FS_ENCRYPTION */ + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 @@ -500,6 +518,15 @@ static struct ceph_vxattr ceph_common_vxattrs[] = { .exists_cb = NULL, .flags = VXATTR_FLAG_READONLY, }, +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + { + .name = "ceph.fscrypt.auth", + .name_size = sizeof("ceph.fscrypt.auth"), + .getxattr_cb = ceph_vxattrcb_fscrypt_auth, + .exists_cb = ceph_vxattrcb_fscrypt_auth_exists, + .flags = VXATTR_FLAG_READONLY, + }, +#endif /* CONFIG_FS_ENCRYPTION */ { .name = NULL, 0 } /* Required table terminator */ }; @@ -1238,7 +1265,7 @@ retry: dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, &prealloc_cf); ci->i_xattrs.dirty = true; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } spin_unlock(&ci->i_ceph_lock); @@ -1408,6 +1435,9 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); #endif +#ifdef CONFIG_FS_ENCRYPTION + kfree(as_ctx->fscrypt_auth); +#endif if (as_ctx->pagelist) ceph_pagelist_release(as_ctx->pagelist); } diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 903ca8fa4b9b..ae023853a98f 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -127,7 +127,8 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_mtime.tv_sec != -1) inode->i_mtime = coda_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = coda_to_timespec64(attr->va_ctime); + inode_set_ctime_to_ts(inode, + coda_to_timespec64(attr->va_ctime)); } diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 1b960de2bf39..cb512b10473b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir) /* optimistically we can also act as if our nose bleeds. The * granularity of the mtime is coarse anyways so we might actually be * right most of the time. Note: we only do this for directories. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); #endif } diff --git a/fs/coda/file.c b/fs/coda/file.c index 12b26bd13564..42346618b4ed 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; - coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); + coda_inode->i_mtime = inode_set_ctime_current(coda_inode); inode_unlock(coda_inode); file_end_write(host_file); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index d661e6cf17ac..0c7c2528791e 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -256,7 +256,8 @@ int coda_getattr(struct mnt_idmap *idmap, const struct path *path, { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, + d_inode(path->dentry), stat); return err; } @@ -269,7 +270,7 @@ int coda_setattr(struct mnt_idmap *idmap, struct dentry *de, memset(&vattr, 0, sizeof(vattr)); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); coda_iattr_to_vattr(iattr, &vattr); vattr.va_type = C_VNON; /* cannot set type */ diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 1c15edbe70ff..fbdcb3582926 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -88,8 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) @@ -99,7 +98,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_gid = iattr->ia_gid; inode->i_atime = iattr->ia_atime; inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; + inode_set_ctime_to_ts(inode, iattr->ia_ctime); } struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, @@ -172,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode) return ERR_PTR(-ENOMEM); p_inode = d_inode(dentry->d_parent); - p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode); + p_inode->i_mtime = inode_set_ctime_current(p_inode); configfs_set_inode_lock_class(sd, inode); return inode; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 27c6597aa1be..5ee7d7bbb361 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -133,7 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb, } /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, + zerotime); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even @@ -485,12 +486,16 @@ static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + generic_shutdown_super(sb); + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); - kill_mtd_super(sb); + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { - kill_block_super(sb); + sync_blockdev(sb->s_bdev); + blkdev_put(sb->s_bdev, sb); } kfree(sbi); } @@ -30,17 +30,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> -static inline unsigned int pe_order(enum page_entry_size pe_size) -{ - if (pe_size == PE_SIZE_PTE) - return PAGE_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PMD) - return PMD_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PUD) - return PUD_SHIFT - PAGE_SHIFT; - return ~0; -} - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -49,9 +38,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size) #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) -/* The order of a PMD entry */ -#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) - static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -1908,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @pe_size: Size of the page to fault in + * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system @@ -1918,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. */ -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); - case PE_SIZE_PMD: + else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); - default: + else return VM_FAULT_FALLBACK; - } } EXPORT_SYMBOL_GPL(dax_iomap_fault); @@ -1979,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted + * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. */ -vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, + pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - unsigned int order = pe_order(pe_size); size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); diff --git a/fs/dcache.c b/fs/dcache.c index 52e6d5fdab6b..25ac74d30bff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1664,7 +1664,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; - printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? @@ -1673,7 +1673,6 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - WARN_ON(1); return D_WALK_CONTINUE; } @@ -3247,8 +3246,6 @@ void d_genocide(struct dentry *parent) d_walk(parent, parent, d_genocide_kill); } -EXPORT_SYMBOL(d_genocide); - void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index b7711888dd17..87b3753aa4b1 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -904,8 +904,52 @@ EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - /* This is really only for read-only strings */ - return -EINVAL; + struct dentry *dentry = F_DENTRY(file); + char *old, *new = NULL; + int pos = *ppos; + int r; + + r = debugfs_file_get(dentry); + if (unlikely(r)) + return r; + + old = *(char **)file->private_data; + + /* only allow strict concatenation */ + r = -EINVAL; + if (pos && pos != strlen(old)) + goto error; + + r = -E2BIG; + if (pos + count + 1 > PAGE_SIZE) + goto error; + + r = -ENOMEM; + new = kmalloc(pos + count + 1, GFP_KERNEL); + if (!new) + goto error; + + if (pos) + memcpy(new, old, pos); + + r = -EFAULT; + if (copy_from_user(new + pos, user_buf, count)) + goto error; + + new[pos + count] = '\0'; + strim(new); + + rcu_assign_pointer(*(char **)file->private_data, new); + synchronize_rcu(); + kfree(old); + + debugfs_file_put(dentry); + return count; + +error: + kfree(new); + debugfs_file_put(dentry); + return r; } static const struct file_operations fops_str = { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3f81f73c241a..83e57e9f9fa0 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -72,8 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index fe3db0eda8e4..299c295a27a0 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb) } inode->i_ino = 2; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); @@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) if (!inode) goto fail; inode->i_ino = 1; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -534,12 +534,12 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) /** * devpts_pty_new -- create a new inode in /dev/pts/ - * @ptmx_inode: inode of the master - * @device: major+minor of the node to be created + * @fsi: Filesystem info for this instance. * @index: used as a name of the node * @priv: what's given back by devpts_get_priv * - * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. + * The dentry for the created inode is returned. + * Remove it from /dev/pts/ with devpts_pty_kill(). */ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { @@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); @@ -580,7 +580,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) /** * devpts_get_priv -- get private data for a slave - * @pts_inode: inode of the slave + * @dentry: dentry of the slave * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ @@ -593,7 +593,7 @@ void *devpts_get_priv(struct dentry *dentry) /** * devpts_pty_kill -- remove inode form /dev/pts/ - * @inode: inode of the slave to be removed + * @dentry: dentry of the slave to be removed * * This is an inverse operation of devpts_pty_new. */ diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 2beceff024e3..e55e0a2cd2e8 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -664,7 +664,7 @@ static ssize_t comm_addr_store(struct config_item *item, const char *buf, memcpy(addr, buf, len); - rv = dlm_lowcomms_addr(cm->nodeid, addr, len); + rv = dlm_midcomms_addr(cm->nodeid, addr, len); if (rv) { kfree(addr); return rv; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index a1aca41c49d0..5aabcb6f0f15 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -18,6 +18,7 @@ #include "dlm_internal.h" #include "midcomms.h" #include "lock.h" +#include "ast.h" #define DLM_DEBUG_BUF_LEN 4096 static char debug_buf[DLM_DEBUG_BUF_LEN]; @@ -365,6 +366,52 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s) unlock_rsb(r); } +static void print_format5_lock(struct seq_file *s, struct dlm_lkb *lkb) +{ + struct dlm_callback *cb; + + /* lkb_id lkb_flags mode flags sb_status sb_flags */ + + spin_lock(&lkb->lkb_cb_lock); + list_for_each_entry(cb, &lkb->lkb_callbacks, list) { + seq_printf(s, "%x %x %d %x %d %x\n", + lkb->lkb_id, + dlm_iflags_val(lkb), + cb->mode, + cb->flags, + cb->sb_status, + cb->sb_flags); + } + spin_unlock(&lkb->lkb_cb_lock); +} + +static void print_format5(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + struct rsbtbl_iter { struct dlm_rsb *rsb; unsigned bucket; @@ -408,6 +455,13 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr) } print_format4(ri->rsb, seq); break; + case 5: + if (ri->header) { + seq_puts(seq, "lkb_id lkb_flags mode flags sb_status sb_flags\n"); + ri->header = 0; + } + print_format5(ri->rsb, seq); + break; } return 0; @@ -417,6 +471,7 @@ static const struct seq_operations format1_seq_ops; static const struct seq_operations format2_seq_ops; static const struct seq_operations format3_seq_ops; static const struct seq_operations format4_seq_ops; +static const struct seq_operations format5_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { @@ -448,6 +503,8 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) ri->format = 3; if (seq->op == &format4_seq_ops) ri->format = 4; + if (seq->op == &format5_seq_ops) + ri->format = 5; tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; @@ -602,10 +659,18 @@ static const struct seq_operations format4_seq_ops = { .show = table_seq_show, }; +static const struct seq_operations format5_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + static const struct file_operations format1_fops; static const struct file_operations format2_fops; static const struct file_operations format3_fops; static const struct file_operations format4_fops; +static const struct file_operations format5_fops; static int table_open1(struct inode *inode, struct file *file) { @@ -683,7 +748,21 @@ static int table_open4(struct inode *inode, struct file *file) struct seq_file *seq; int ret; - ret = seq_open(file, &format4_seq_ops); + ret = seq_open(file, &format5_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open5(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format5_seq_ops); if (ret) return ret; @@ -725,6 +804,14 @@ static const struct file_operations format4_fops = { .release = seq_release }; +static const struct file_operations format5_fops = { + .owner = THIS_MODULE, + .open = table_open5, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + /* * dump lkb's on the ls_waiters list */ @@ -793,6 +880,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_locks_dentry); debugfs_remove(ls->ls_debug_all_dentry); debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_queued_asts_dentry); } static int dlm_state_show(struct seq_file *file, void *offset) @@ -936,6 +1024,17 @@ void dlm_create_debug_file(struct dlm_ls *ls) dlm_root, ls, &waiters_fops); + + /* format 5 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_queued_asts", ls->ls_name); + + ls->ls_debug_queued_asts_dentry = debugfs_create_file(name, + 0644, + dlm_root, + ls, + &format5_fops); } void __init dlm_register_debugfs(void) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index fb1981654bb2..f6acba4310a7 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -58,7 +58,7 @@ void dlm_recover_dir_nodeid(struct dlm_ls *ls) up_read(&ls->ls_root_sem); } -int dlm_recover_directory(struct dlm_ls *ls) +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; char *b, *last_name = NULL; @@ -90,7 +90,7 @@ int dlm_recover_directory(struct dlm_ls *ls) } error = dlm_rcom_names(ls, memb->nodeid, - last_name, last_len); + last_name, last_len, seq); if (error) goto out_free; @@ -196,7 +196,8 @@ int dlm_recover_directory(struct dlm_ls *ls) return error; } -static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, const char *name, + int len) { struct dlm_rsb *r; uint32_t hash, bucket; @@ -232,7 +233,7 @@ static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) for rsb's we're master of and whose directory node matches the requesting node. inbuf is the rsb name last sent, inlen is the name's length */ -void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, char *outbuf, int outlen, int nodeid) { struct list_head *list; @@ -245,9 +246,8 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if (inlen > 1) { r = find_rsb_root(ls, inbuf, inlen); if (!r) { - inbuf[inlen - 1] = '\0'; - log_error(ls, "copy_master_names from %d start %d %s", - nodeid, inlen, inbuf); + log_error(ls, "copy_master_names from %d start %d %.*s", + nodeid, inlen, inlen, inbuf); goto out; } list = r->res_root_list.next; diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h index 03844d086be2..39ecb69d7ef3 100644 --- a/fs/dlm/dir.h +++ b/fs/dlm/dir.h @@ -15,9 +15,9 @@ int dlm_dir_nodeid(struct dlm_rsb *rsb); int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); void dlm_recover_dir_nodeid(struct dlm_ls *ls); -int dlm_recover_directory(struct dlm_ls *ls); -void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, - char *outbuf, int outlen, int nodeid); +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq); +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); #endif /* __DIR_DOT_H__ */ diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index c8156770205e..dfc444dad329 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -598,6 +598,7 @@ struct dlm_ls { struct dentry *ls_debug_locks_dentry; /* debugfs */ struct dentry *ls_debug_all_dentry; /* debugfs */ struct dentry *ls_debug_toss_dentry; /* debugfs */ + struct dentry *ls_debug_queued_asts_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f511a9d7d416..652c51fbbf76 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -86,8 +86,8 @@ static int send_remove(struct dlm_rsb *r); static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms, bool local); -static int receive_extralen(struct dlm_message *ms); + const struct dlm_message *ms, bool local); +static int receive_extralen(const struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); static void toss_rsb(struct kref *kref); @@ -984,8 +984,8 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0) */ -int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, - unsigned int flags, int *r_nodeid, int *result) +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result) { struct dlm_rsb *r = NULL; uint32_t hash, b; @@ -1106,7 +1106,7 @@ static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) } } -void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len) +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len) { struct dlm_rsb *r = NULL; uint32_t hash, b; @@ -1459,7 +1459,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) set RESEND and dlm_recover_waiters_post() */ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, - struct dlm_message *ms) + const struct dlm_message *ms) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int overlap_done = 0; @@ -1557,8 +1557,8 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) /* Handles situations where we might be processing a "fake" or "local" reply in which we can't try to take waiters_mutex again. */ -static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static int remove_from_waiters_ms(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; @@ -1800,7 +1800,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) /* lkb is process copy (pc) */ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { int b; @@ -1907,7 +1907,7 @@ static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { set_lvb_lock_pc(r, lkb, ms); _grant_lock(r, lkb); @@ -1945,7 +1945,7 @@ static void munge_demoted(struct dlm_lkb *lkb) lkb->lkb_grmode = DLM_LOCK_NL; } -static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) +static void munge_altmode(struct dlm_lkb *lkb, const struct dlm_message *ms) { if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) && ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) { @@ -3641,8 +3641,9 @@ static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); } -static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, - int ret_nodeid, int rv) +static int send_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms_in, int ret_nodeid, + int rv) { struct dlm_rsb *r = &ls->ls_local_rsb; struct dlm_message *ms; @@ -3667,14 +3668,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, of message, unlike the send side where we can safely send everything about the lkb for any type of message */ -static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) +static void receive_flags(struct dlm_lkb *lkb, const struct dlm_message *ms) { lkb->lkb_exflags = le32_to_cpu(ms->m_exflags); dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms, +static void receive_flags_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { if (local) @@ -3684,14 +3686,14 @@ static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms, dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static int receive_extralen(struct dlm_message *ms) +static int receive_extralen(const struct dlm_message *ms) { return (le16_to_cpu(ms->m_header.h_length) - sizeof(struct dlm_message)); } static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { int len; @@ -3719,7 +3721,7 @@ static void fake_astfn(void *astparam) } static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); lkb->lkb_ownpid = le32_to_cpu(ms->m_pid); @@ -3741,7 +3743,7 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { if (lkb->lkb_status != DLM_LKSTS_GRANTED) return -EBUSY; @@ -3756,7 +3758,7 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { if (receive_lvb(ls, lkb, ms)) return -ENOMEM; @@ -3766,7 +3768,7 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, /* We fill in the local-lkb fields with the info that send_xxxx_reply() uses to send a reply and that the remote end uses to process the reply. */ -static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) +static void setup_local_lkb(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb = &ls->ls_local_lkb; lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); @@ -3776,7 +3778,7 @@ static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) /* This is called after the rsb is locked so that we can safely inspect fields in the lkb. */ -static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +static int validate_message(struct dlm_lkb *lkb, const struct dlm_message *ms) { int from = le32_to_cpu(ms->m_header.h_nodeid); int error = 0; @@ -3828,7 +3830,7 @@ out: return error; } -static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3907,7 +3909,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3963,7 +3965,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4015,7 +4017,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4051,7 +4053,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_grant(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4082,7 +4084,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_bast(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4110,7 +4112,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_lookup(struct dlm_ls *ls, const struct dlm_message *ms) { int len, error, ret_nodeid, from_nodeid, our_nodeid; @@ -4130,7 +4132,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) send_lookup_reply(ls, ms, ret_nodeid, error); } -static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) { char name[DLM_RESNAME_MAXLEN+1]; struct dlm_rsb *r; @@ -4218,12 +4220,13 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) } } -static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms) { do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid)); } -static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4345,7 +4348,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) } static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms, bool local) + const struct dlm_message *ms, bool local) { /* this is the value returned from do_convert() on the master */ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { @@ -4388,8 +4391,8 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, } } -static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_convert_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4412,7 +4415,8 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4426,8 +4430,8 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_unlock_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4463,7 +4467,8 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4477,8 +4482,8 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_cancel_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4515,7 +4520,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4529,7 +4535,8 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4608,7 +4615,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) dlm_put_lkb(lkb); } -static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, +static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq) { int error = 0, noent = 0; @@ -4744,7 +4751,7 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, requestqueue, to processing all the saved messages, to processing new messages as they arrive. */ -static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, +static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms, int nodeid) { if (dlm_locking_stopped(ls)) { @@ -4767,7 +4774,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, /* This is called by dlm_recoverd to process messages that were saved on the requestqueue. */ -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq) { _receive_message(ls, ms, saved_seq); @@ -4778,9 +4785,9 @@ void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, standard locking activity) or an RCOM (recovery message sent as part of lockspace recovery). */ -void dlm_receive_buffer(union dlm_packet *p, int nodeid) +void dlm_receive_buffer(const union dlm_packet *p, int nodeid) { - struct dlm_header *hd = &p->header; + const struct dlm_header *hd = &p->header; struct dlm_ls *ls; int type = 0; @@ -5334,7 +5341,7 @@ static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, /* needs at least dlm_rcom + rcom_lock */ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_rsb *r, struct dlm_rcom *rc) + struct dlm_rsb *r, const struct dlm_rcom *rc) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; @@ -5384,7 +5391,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, back the rcom_lock struct we got but with the remid field filled in. */ /* needs at least dlm_rcom + rcom_lock */ -int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; @@ -5393,6 +5401,9 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); int error; + /* init rl_remid with rcom lock rl_remid */ + *rl_remid = rl->rl_remid; + if (rl->rl_parent_lkid) { error = -EOPNOTSUPP; goto out; @@ -5448,7 +5459,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) out_remid: /* this is the new value returned to the lock holder for saving in its process-copy lkb */ - rl->rl_remid = cpu_to_le32(lkb->lkb_id); + *rl_remid = cpu_to_le32(lkb->lkb_id); lkb->lkb_recover_seq = ls->ls_recover_seq; @@ -5459,12 +5470,13 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) if (error && error != -EEXIST) log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", from_nodeid, remid, error); - rl->rl_result = cpu_to_le32(error); + *rl_result = cpu_to_le32(error); return error; } /* needs at least dlm_rcom + rcom_lock */ -int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; @@ -5509,7 +5521,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, result); - dlm_send_rcom_lock(r, lkb); + dlm_send_rcom_lock(r, lkb, seq); goto out; case -EEXIST: case 0: diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index aa5ad44d902b..b54e2cbbe6e2 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -12,11 +12,11 @@ #define __LOCK_DOT_H__ void dlm_dump_rsb(struct dlm_rsb *r); -void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len); +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len); void dlm_print_lkb(struct dlm_lkb *lkb); -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq); -void dlm_receive_buffer(union dlm_packet *p, int nodeid); +void dlm_receive_buffer(const union dlm_packet *p, int nodeid); int dlm_modes_compat(int mode1, int mode2); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); @@ -25,8 +25,8 @@ void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, - unsigned int flags, int *r_nodeid, int *result); +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result); int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret); @@ -36,8 +36,10 @@ void dlm_purge_mstcpy_locks(struct dlm_rsb *r); void dlm_recover_grant(struct dlm_ls *ls); int dlm_recover_waiters_post(struct dlm_ls *ls); void dlm_recover_waiters_pre(struct dlm_ls *ls); -int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); -int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result); +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq); int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f14ea9f6322..f7bc22e74db2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -863,7 +863,6 @@ struct dlm_processed_nodes { static void process_dlm_messages(struct work_struct *work) { struct processqueue_entry *pentry; - LIST_HEAD(processed_nodes); spin_lock(&processqueue_lock); pentry = list_first_entry_or_null(&processqueue, diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 77d202e4a02a..be7909ead71b 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -18,7 +18,7 @@ #include "midcomms.h" #include "lowcomms.h" -int dlm_slots_version(struct dlm_header *h) +int dlm_slots_version(const struct dlm_header *h) { if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS) return 0; @@ -393,14 +393,9 @@ static void remove_remote_member(int nodeid) dlm_midcomms_remove_member(nodeid); } -static void clear_members_cb(int nodeid) -{ - remove_remote_member(nodeid); -} - void dlm_clear_members(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes, clear_members_cb); + clear_memb_list(&ls->ls_nodes, remove_remote_member); ls->ls_num_nodes = 0; } @@ -454,7 +449,7 @@ static void make_member_array(struct dlm_ls *ls) /* send a status request to all members just to establish comms connections */ -static int ping_members(struct dlm_ls *ls) +static int ping_members(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; int error = 0; @@ -464,7 +459,7 @@ static int ping_members(struct dlm_ls *ls) error = -EINTR; break; } - error = dlm_rcom_status(ls, memb->nodeid, 0); + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); if (error) break; } @@ -612,7 +607,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) make_member_array(ls); *neg_out = neg; - error = ping_members(ls); + error = ping_members(ls, rv->seq); log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 433b2fac9f4a..f61cfde46314 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -18,7 +18,7 @@ void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); int dlm_is_member(struct dlm_ls *ls, int nodeid); -int dlm_slots_version(struct dlm_header *h); +int dlm_slots_version(const struct dlm_header *h); void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, struct dlm_member *memb); void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e1a0df67b566..f641b36a36db 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -330,18 +330,23 @@ static void midcomms_node_reset(struct midcomms_node *node) wake_up(&node->shutdown_wait); } -static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) +static struct midcomms_node *nodeid2node(int nodeid) { - struct midcomms_node *node, *tmp; - int r = nodeid_hash(nodeid); + return __find_node(nodeid, nodeid_hash(nodeid)); +} + +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + int ret, r = nodeid_hash(nodeid); + struct midcomms_node *node; - node = __find_node(nodeid, r); - if (node || !alloc) - return node; + ret = dlm_lowcomms_addr(nodeid, addr, len); + if (ret) + return ret; - node = kmalloc(sizeof(*node), alloc); + node = kmalloc(sizeof(*node), GFP_NOFS); if (!node) - return NULL; + return -ENOMEM; node->nodeid = nodeid; spin_lock_init(&node->state_lock); @@ -353,21 +358,11 @@ static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) midcomms_node_reset(node); spin_lock(&nodes_lock); - /* check again if there was somebody else - * earlier here to add the node - */ - tmp = __find_node(nodeid, r); - if (tmp) { - spin_unlock(&nodes_lock); - kfree(node); - return tmp; - } - hlist_add_head_rcu(&node->hlist, &node_hash[r]); spin_unlock(&nodes_lock); node->debugfs = dlm_create_debug_comms_file(nodeid, node); - return node; + return 0; } static int dlm_send_ack(int nodeid, uint32_t seq) @@ -499,7 +494,8 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); } -static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +static void dlm_receive_buffer_3_2_trace(uint32_t seq, + const union dlm_packet *p) { switch (p->header.h_cmd) { case DLM_MSG: @@ -513,7 +509,7 @@ static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) } } -static void dlm_midcomms_receive_buffer(union dlm_packet *p, +static void dlm_midcomms_receive_buffer(const union dlm_packet *p, struct midcomms_node *node, uint32_t seq) { @@ -602,113 +598,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, } } -static struct midcomms_node * -dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, - uint16_t msglen, int (*cb)(struct midcomms_node *node)) -{ - struct midcomms_node *node = NULL; - gfp_t allocation = 0; - int ret; - - switch (p->header.h_cmd) { - case DLM_RCOM: - if (msglen < sizeof(struct dlm_rcom)) { - log_print("rcom msg too small: %u, will skip this message from node %d", - msglen, nodeid); - return NULL; - } - - switch (p->rcom.rc_type) { - case cpu_to_le32(DLM_RCOM_NAMES): - fallthrough; - case cpu_to_le32(DLM_RCOM_NAMES_REPLY): - fallthrough; - case cpu_to_le32(DLM_RCOM_STATUS): - fallthrough; - case cpu_to_le32(DLM_RCOM_STATUS_REPLY): - node = nodeid2node(nodeid, 0); - if (node) { - spin_lock(&node->state_lock); - if (node->state != DLM_ESTABLISHED) - pr_debug("receive begin RCOM msg from node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - - switch (node->state) { - case DLM_CLOSED: - node->state = DLM_ESTABLISHED; - pr_debug("switch node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); - break; - case DLM_ESTABLISHED: - break; - default: - spin_unlock(&node->state_lock); - return NULL; - } - spin_unlock(&node->state_lock); - } - - allocation = GFP_NOFS; - break; - default: - break; - } - - break; - default: - break; - } - - node = nodeid2node(nodeid, allocation); - if (!node) { - switch (p->header.h_cmd) { - case DLM_OPTS: - if (msglen < sizeof(struct dlm_opts)) { - log_print("opts msg too small: %u, will skip this message from node %d", - msglen, nodeid); - return NULL; - } - - log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence", - p->opts.o_nextcmd, nodeid); - break; - default: - log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence", - p->header.h_cmd, nodeid); - break; - } - - return NULL; - } - - ret = cb(node); - if (ret < 0) - return NULL; - - return node; -} - -static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) -{ - switch (node->version) { - case DLM_VERSION_NOT_SET: - node->version = DLM_VERSION_3_2; - wake_up(&node->shutdown_wait); - log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, - node->nodeid); - break; - case DLM_VERSION_3_2: - break; - default: - log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", - DLM_VERSION_3_2, node->nodeid, node->version); - return -1; - } - - return 0; -} - -static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid) +static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, + int nodeid) { int len = msglen; @@ -757,7 +648,7 @@ static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodei return 0; } -static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) +static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) { uint16_t msglen = le16_to_cpu(p->header.h_length); struct midcomms_node *node; @@ -765,10 +656,37 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) int ret, idx; idx = srcu_read_lock(&nodes_srcu); - node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, - dlm_midcomms_version_check_3_2); - if (!node) + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) + goto out; + + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + + spin_lock(&node->state_lock); + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + break; + } + spin_unlock(&node->state_lock); + + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); goto out; + } switch (p->header.h_cmd) { case DLM_RCOM: @@ -858,8 +776,19 @@ out: srcu_read_unlock(&nodes_srcu, idx); } -static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) +static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) { + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; @@ -872,22 +801,6 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) default: log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", DLM_VERSION_3_1, node->nodeid, node->version); - return -1; - } - - return 0; -} - -static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) -{ - uint16_t msglen = le16_to_cpu(p->header.h_length); - struct midcomms_node *node; - int idx; - - idx = srcu_read_lock(&nodes_srcu); - node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, - dlm_midcomms_version_check_3_1); - if (!node) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -977,10 +890,10 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): - dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); + dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid); break; case cpu_to_le32(DLM_VERSION_3_2): - dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); + dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid); break; default: log_print("received invalid version header: %u from node %d, will skip this message", @@ -1003,8 +916,8 @@ void dlm_midcomms_unack_msg_resend(int nodeid) int idx, ret; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1090,11 +1003,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { - WARN_ON_ONCE(1); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) goto err; - } /* this is a bug, however we going on and hope it will be resolved */ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); @@ -1235,8 +1146,34 @@ void dlm_midcomms_init(void) dlm_lowcomms_init(); } +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); + kfree(node); +} + void dlm_midcomms_exit(void) { + struct midcomms_node *node; + int i, idx; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + dlm_lowcomms_exit(); } @@ -1277,8 +1214,8 @@ void dlm_midcomms_add_member(int nodeid) int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, GFP_NOFS); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1322,8 +1259,8 @@ void dlm_midcomms_remove_member(int nodeid) int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1367,15 +1304,6 @@ void dlm_midcomms_remove_member(int nodeid) srcu_read_unlock(&nodes_srcu, idx); } -static void midcomms_node_release(struct rcu_head *rcu) -{ - struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); - - WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); - dlm_send_queue_flush(node); - kfree(node); -} - void dlm_midcomms_version_wait(void) { struct midcomms_node *node; @@ -1438,7 +1366,7 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + if (!ret) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); else @@ -1456,14 +1384,6 @@ void dlm_midcomms_shutdown(void) for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { midcomms_shutdown(node); - - dlm_delete_debug_comms_file(node->debugfs); - - spin_lock(&nodes_lock); - hlist_del_rcu(&node->hlist); - spin_unlock(&nodes_lock); - - call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); } } srcu_read_unlock(&nodes_srcu, idx); @@ -1479,7 +1399,7 @@ int dlm_midcomms_close(int nodeid) idx = srcu_read_lock(&nodes_srcu); /* Abort pending close/remove operation */ - node = nodeid2node(nodeid, 0); + node = nodeid2node(nodeid); if (node) { /* let shutdown waiters leave */ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); @@ -1489,20 +1409,32 @@ int dlm_midcomms_close(int nodeid) synchronize_srcu(&nodes_srcu); - idx = srcu_read_lock(&nodes_srcu); mutex_lock(&close_lock); - node = nodeid2node(nodeid, 0); + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); if (!node) { - mutex_unlock(&close_lock); srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); return dlm_lowcomms_close(nodeid); } ret = dlm_lowcomms_close(nodeid); - spin_lock(&node->state_lock); - midcomms_node_reset(node); - spin_unlock(&node->state_lock); + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); srcu_read_unlock(&nodes_srcu, idx); + + /* wait that all readers left until flush send queue */ + synchronize_srcu(&nodes_srcu); + + /* drop all pending dlm messages, this is fine as + * this function get called when the node is fenced + */ + dlm_send_queue_flush(node); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); mutex_unlock(&close_lock); return ret; diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 9f8c9605013d..e7246fb3ef57 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 70a4752ed913..e6b4c1a21446 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -11,6 +11,8 @@ #include <linux/dlm_plock.h> #include <linux/slab.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lockspace.h" @@ -42,6 +44,27 @@ static inline void set_version(struct dlm_plock_info *info) info->version[2] = DLM_PLOCK_VERSION_PATCH; } +static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info) +{ + struct plock_op *op = NULL, *iter; + + list_for_each_entry(iter, &recv_list, list) { + if (iter->info.fsid == info->fsid && + iter->info.number == info->number && + iter->info.owner == info->owner && + iter->info.pid == info->pid && + iter->info.start == info->start && + iter->info.end == info->end && + iter->info.ex == info->ex && + iter->info.wait) { + op = iter; + break; + } + } + + return op; +} + static int check_version(struct dlm_plock_info *info) { if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || @@ -74,30 +97,26 @@ static void send_op(struct plock_op *op) wake_up(&send_wq); } -/* If a process was killed while waiting for the only plock on a file, - locks_remove_posix will not see any lock on the file so it won't - send an unlock-close to us to pass on to userspace to clean up the - abandoned waiter. So, we have to insert the unlock-close when the - lock call is interrupted. */ - -static void do_unlock_close(const struct dlm_plock_info *info) +static int do_lock_cancel(const struct dlm_plock_info *orig_info) { struct plock_op *op; + int rv; op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) - return; + return -ENOMEM; + + op->info = *orig_info; + op->info.optype = DLM_PLOCK_OP_CANCEL; + op->info.wait = 0; - op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = info->pid; - op->info.fsid = info->fsid; - op->info.number = info->number; - op->info.start = 0; - op->info.end = OFFSET_MAX; - op->info.owner = info->owner; - - op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); + wait_event(recv_wq, (op->done != 0)); + + rv = op->info.rv; + + dlm_release_plock_op(op); + return rv; } int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, @@ -156,7 +175,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); if (op->info.wait) { - rv = wait_event_killable(recv_wq, (op->done != 0)); + rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); /* recheck under ops_lock if we got a done != 0, @@ -166,17 +185,37 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, spin_unlock(&ops_lock); goto do_lock_wait; } - list_del(&op->list); spin_unlock(&ops_lock); + rv = do_lock_cancel(&op->info); + switch (rv) { + case 0: + /* waiter was deleted in user space, answer will never come + * remove original request. The original request must be + * on recv_list because the answer of do_lock_cancel() + * synchronized it. + */ + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + rv = -EINTR; + break; + case -ENOENT: + /* cancellation wasn't successful but op should be done */ + fallthrough; + default: + /* internal error doing cancel we need to wait */ + goto wait; + } + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, (unsigned long long)number, op->info.pid); - do_unlock_close(&op->info); dlm_release_plock_op(op); goto out; } } else { +wait: wait_event(recv_wq, (op->done != 0)); } @@ -240,8 +279,8 @@ static int dlm_plock_callback(struct plock_op *op) rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ - log_print("dlm_plock_callback: lock granted after lock request " - "failed; dangling lock!\n"); + log_print("%s: lock granted after lock request failed; dangling lock!", + __func__); goto out; } @@ -318,6 +357,75 @@ out: } EXPORT_SYMBOL_GPL(dlm_posix_unlock); +/* + * NOTE: This implementation can only handle async lock requests as nfs + * do it. It cannot handle cancellation of a pending lock request sitting + * in wait_event(), but for now only nfs is the only user local kernel + * user. + */ +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_plock_info info; + struct plock_op *op; + struct dlm_ls *ls; + int rv; + + /* this only works for async request for now and nfs is the only + * kernel user right now. + */ + if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) + return -EOPNOTSUPP; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + info.pid = fl->fl_pid; + info.ex = (fl->fl_type == F_WRLCK); + info.fsid = ls->ls_global_id; + dlm_put_lockspace(ls); + info.number = number; + info.start = fl->fl_start; + info.end = fl->fl_end; + info.owner = (__u64)fl->fl_pid; + + rv = do_lock_cancel(&info); + switch (rv) { + case 0: + spin_lock(&ops_lock); + /* lock request to cancel must be on recv_list because + * do_lock_cancel() synchronizes it. + */ + op = plock_lookup_waiter(&info); + if (WARN_ON_ONCE(!op)) { + spin_unlock(&ops_lock); + rv = -ENOLCK; + break; + } + + list_del(&op->list); + spin_unlock(&ops_lock); + WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); + op->data->callback(op->data->fl, -EINTR); + dlm_release_plock_op(op); + rv = -EINTR; + break; + case -ENOENT: + /* if cancel wasn't successful we probably were to late + * or it was a non-blocking lock request, so just unlock it. + */ + rv = dlm_posix_unlock(lockspace, number, file, fl); + break; + default: + break; + } + + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_cancel); + int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct file_lock *fl) { @@ -403,6 +511,8 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, if (!op) return -EAGAIN; + trace_dlm_plock_read(&info); + /* there is no need to get a reply from userspace for unlocks that were generated by the vfs cleaning up for a close (the process did not make an unlock call). */ @@ -430,6 +540,8 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (copy_from_user(&info, u, sizeof(info))) return -EFAULT; + trace_dlm_plock_write(&info); + if (check_version(&info)) return -EINVAL; @@ -441,22 +553,11 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, */ spin_lock(&ops_lock); if (info.wait) { - list_for_each_entry(iter, &recv_list, list) { - if (iter->info.fsid == info.fsid && - iter->info.number == info.number && - iter->info.owner == info.owner && - iter->info.pid == info.pid && - iter->info.start == info.start && - iter->info.end == info.end && - iter->info.ex == info.ex && - iter->info.wait) { - op = iter; - break; - } - } + op = plock_lookup_waiter(&info); } else { list_for_each_entry(iter, &recv_list, list) { - if (!iter->info.wait) { + if (!iter->info.wait && + iter->info.fsid == info.fsid) { op = iter; break; } @@ -468,8 +569,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (info.wait) WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); else - WARN_ON(op->info.fsid != info.fsid || - op->info.number != info.number || + WARN_ON(op->info.number != info.number || op->info.owner != info.owner || op->info.optype != info.optype); @@ -534,5 +634,7 @@ int dlm_plock_init(void) void dlm_plock_exit(void) { misc_deregister(&plock_dev_misc); + WARN_ON(!list_empty(&send_list)); + WARN_ON(!list_empty(&recv_list)); } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f4afdf892f78..3b734aed26b5 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -28,7 +28,8 @@ static int rcom_response(struct dlm_ls *ls) } static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, char *mb, int mb_len) + struct dlm_rcom **rc_ret, char *mb, int mb_len, + uint64_t seq) { struct dlm_rcom *rc; @@ -41,16 +42,14 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_header.h_cmd = DLM_RCOM; rc->rc_type = cpu_to_le32(type); - - spin_lock(&ls->ls_recover_lock); - rc->rc_seq = cpu_to_le64(ls->ls_recover_seq); - spin_unlock(&ls->ls_recover_lock); + rc->rc_seq = cpu_to_le64(seq); *rc_ret = rc; } static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret, + uint64_t seq) { int mb_len = sizeof(struct dlm_rcom) + len; struct dlm_mhandle *mh; @@ -63,14 +62,14 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, return -ENOBUFS; } - _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); *mh_ret = mh; return 0; } static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, int len, struct dlm_rcom **rc_ret, - struct dlm_msg **msg_ret) + struct dlm_msg **msg_ret, uint64_t seq) { int mb_len = sizeof(struct dlm_rcom) + len; struct dlm_msg *msg; @@ -84,7 +83,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, return -ENOBUFS; } - _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); *msg_ret = msg; return 0; } @@ -170,7 +169,8 @@ static void disallow_sync_reply(struct dlm_ls *ls) * node's rcom_config. */ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq) { struct dlm_rcom *rc; struct dlm_msg *msg; @@ -186,7 +186,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) retry: error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, - sizeof(struct rcom_status), &rc, &msg); + sizeof(struct rcom_status), &rc, &msg, + seq); if (error) goto out; @@ -220,7 +221,9 @@ retry: return error; } -static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_status(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, + uint64_t seq) { struct dlm_rcom *rc; struct rcom_status *rs; @@ -251,7 +254,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) do_create: error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, - len, &rc, &msg); + len, &rc, &msg, seq); if (error) return; @@ -281,7 +284,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom_stateless(msg, rc); } -static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_sync_reply(struct dlm_ls *ls, const struct dlm_rcom *rc_in) { spin_lock(&ls->ls_rcom_spin); if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || @@ -302,17 +305,18 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_rcom_spin); } -int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq) { + struct dlm_mhandle *mh; struct dlm_rcom *rc; - struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; retry: - error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len, - &rc, &msg); + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &mh, seq); if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); @@ -320,7 +324,7 @@ retry: allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(msg, rc); + send_rcom(mh, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -330,19 +334,20 @@ retry: return error; } -static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_names(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) { + struct dlm_mhandle *mh; struct dlm_rcom *rc; int error, inlen, outlen, nodeid; - struct dlm_msg *msg; nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); inlen = le16_to_cpu(rc_in->rc_header.h_length) - sizeof(struct dlm_rcom); outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); - error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, - &rc, &msg); + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &mh, seq); if (error) return; rc->rc_id = rc_in->rc_id; @@ -350,10 +355,10 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom_stateless(msg, rc); + send_rcom(mh, rc); } -int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -361,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) int error; error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, - &rc, &mh); + &rc, &mh, seq); if (error) goto out; memcpy(rc->rc_buf, r->res_name, r->res_length); @@ -372,7 +377,8 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) return error; } -static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lookup(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, uint64_t seq) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -387,7 +393,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) return; } - error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh, + seq); if (error) return; @@ -402,7 +409,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(mh, rc); } -static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lookup_reply(struct dlm_ls *ls, + const struct dlm_rcom *rc_in) { dlm_recover_master_reply(ls, rc_in); } @@ -437,7 +445,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); } -int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq) { struct dlm_ls *ls = r->res_ls; struct dlm_rcom *rc; @@ -448,7 +456,8 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) if (lkb->lkb_lvbptr) len += ls->ls_lvblen; - error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh); + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh, + seq); if (error) goto out; @@ -462,23 +471,28 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } /* needs at least dlm_rcom + rcom_lock */ -static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lock(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) { + __le32 rl_remid, rl_result; + struct rcom_lock *rl; struct dlm_rcom *rc; struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); - dlm_recover_master_copy(ls, rc_in); + dlm_recover_master_copy(ls, rc_in, &rl_remid, &rl_result); error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, - sizeof(struct rcom_lock), &rc, &mh); + sizeof(struct rcom_lock), &rc, &mh, seq); if (error) return; - /* We send back the same rcom_lock struct we received, but - dlm_recover_master_copy() has filled in rl_remid and rl_result */ - memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rl = (struct rcom_lock *)rc->rc_buf; + /* set rl_remid and rl_result from dlm_recover_master_copy() */ + rl->rl_remid = rl_remid; + rl->rl_result = rl_result; + rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; @@ -488,7 +502,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) /* If the lockspace doesn't exist then still send a status message back; it's possible that it just doesn't have its global_id yet. */ -int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct rcom_config *rf; @@ -566,7 +580,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) /* Called by dlm_recv; corresponds to dlm_receive_message() but special recovery-only comms are sent through here. */ -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, int nodeid) { int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); int stop, reply = 0, names = 0, lookup = 0, lock = 0; @@ -620,21 +634,21 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) switch (rc->rc_type) { case cpu_to_le32(DLM_RCOM_STATUS): - receive_rcom_status(ls, rc); + receive_rcom_status(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_NAMES): - receive_rcom_names(ls, rc); + receive_rcom_names(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_LOOKUP): - receive_rcom_lookup(ls, rc); + receive_rcom_lookup(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_LOCK): if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; - receive_rcom_lock(ls, rc); + receive_rcom_lock(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_STATUS_REPLY): @@ -652,7 +666,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) case cpu_to_le32(DLM_RCOM_LOCK_REPLY): if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; - dlm_recover_process_copy(ls, rc); + dlm_recover_process_copy(ls, rc, seq); break; default: diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index 454d3c4814ab..765926ae0020 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -12,12 +12,15 @@ #ifndef __RCOM_DOT_H__ #define __RCOM_DOT_H__ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); -int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); -int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); -int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); -int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq); +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, + int nodeid); +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in); #endif diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 29d71a5018d4..53917c0aa3c0 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -93,7 +93,7 @@ void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) } static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, - int save_slots) + int save_slots, uint64_t seq) { struct dlm_rcom *rc = ls->ls_recover_buf; struct dlm_member *memb; @@ -107,7 +107,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, goto out; } - error = dlm_rcom_status(ls, memb->nodeid, 0); + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); if (error) goto out; @@ -126,7 +126,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, } static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, - uint32_t status_flags) + uint32_t status_flags, uint64_t seq) { struct dlm_rcom *rc = ls->ls_recover_buf; int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; @@ -137,7 +137,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, goto out; } - error = dlm_rcom_status(ls, nodeid, status_flags); + error = dlm_rcom_status(ls, nodeid, status_flags, seq); if (error) break; @@ -151,22 +151,22 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, return error; } -static int wait_status(struct dlm_ls *ls, uint32_t status) +static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq) { uint32_t status_all = status << 1; int error; if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, status, 0); + error = wait_status_all(ls, status, 0, seq); if (!error) dlm_set_recover_status(ls, status_all); } else - error = wait_status_low(ls, status_all, 0); + error = wait_status_low(ls, status_all, 0, seq); return error; } -int dlm_recover_members_wait(struct dlm_ls *ls) +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; struct dlm_slot *slots; @@ -180,7 +180,7 @@ int dlm_recover_members_wait(struct dlm_ls *ls) } if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, DLM_RS_NODES, 1); + error = wait_status_all(ls, DLM_RS_NODES, 1, seq); if (error) goto out; @@ -199,7 +199,8 @@ int dlm_recover_members_wait(struct dlm_ls *ls) dlm_set_recover_status(ls, DLM_RS_NODES_ALL); } } else { - error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + error = wait_status_low(ls, DLM_RS_NODES_ALL, + DLM_RSF_NEED_SLOTS, seq); if (error) goto out; @@ -209,19 +210,19 @@ int dlm_recover_members_wait(struct dlm_ls *ls) return error; } -int dlm_recover_directory_wait(struct dlm_ls *ls) +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_DIR); + return wait_status(ls, DLM_RS_DIR, seq); } -int dlm_recover_locks_wait(struct dlm_ls *ls) +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_LOCKS); + return wait_status(ls, DLM_RS_LOCKS, seq); } -int dlm_recover_done_wait(struct dlm_ls *ls) +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_DONE); + return wait_status(ls, DLM_RS_DONE, seq); } /* @@ -441,7 +442,7 @@ static void set_new_master(struct dlm_rsb *r) * equals our_nodeid below). */ -static int recover_master(struct dlm_rsb *r, unsigned int *count) +static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq) { struct dlm_ls *ls = r->res_ls; int our_nodeid, dir_nodeid; @@ -472,7 +473,7 @@ static int recover_master(struct dlm_rsb *r, unsigned int *count) error = 0; } else { recover_idr_add(r); - error = dlm_send_rcom_lookup(r, dir_nodeid); + error = dlm_send_rcom_lookup(r, dir_nodeid, seq); } (*count)++; @@ -520,7 +521,7 @@ static int recover_master_static(struct dlm_rsb *r, unsigned int *count) * the correct dir node. */ -int dlm_recover_masters(struct dlm_ls *ls) +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq) { struct dlm_rsb *r; unsigned int total = 0; @@ -542,7 +543,7 @@ int dlm_recover_masters(struct dlm_ls *ls) if (nodir) error = recover_master_static(r, &count); else - error = recover_master(r, &count); + error = recover_master(r, &count, seq); unlock_rsb(r); cond_resched(); total++; @@ -563,7 +564,7 @@ int dlm_recover_masters(struct dlm_ls *ls) return error; } -int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc) { struct dlm_rsb *r; int ret_nodeid, new_master; @@ -614,13 +615,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) * an equal number of replies then recovery for the rsb is done */ -static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head, + uint64_t seq) { struct dlm_lkb *lkb; int error = 0; list_for_each_entry(lkb, head, lkb_statequeue) { - error = dlm_send_rcom_lock(r, lkb); + error = dlm_send_rcom_lock(r, lkb, seq); if (error) break; r->res_recover_locks_count++; @@ -629,7 +631,7 @@ static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) return error; } -static int recover_locks(struct dlm_rsb *r) +static int recover_locks(struct dlm_rsb *r, uint64_t seq) { int error = 0; @@ -637,13 +639,13 @@ static int recover_locks(struct dlm_rsb *r) DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); - error = recover_locks_queue(r, &r->res_grantqueue); + error = recover_locks_queue(r, &r->res_grantqueue, seq); if (error) goto out; - error = recover_locks_queue(r, &r->res_convertqueue); + error = recover_locks_queue(r, &r->res_convertqueue, seq); if (error) goto out; - error = recover_locks_queue(r, &r->res_waitqueue); + error = recover_locks_queue(r, &r->res_waitqueue, seq); if (error) goto out; @@ -656,7 +658,7 @@ static int recover_locks(struct dlm_rsb *r) return error; } -int dlm_recover_locks(struct dlm_ls *ls) +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq) { struct dlm_rsb *r; int error, count = 0; @@ -677,7 +679,7 @@ int dlm_recover_locks(struct dlm_ls *ls) goto out; } - error = recover_locks(r); + error = recover_locks(r, seq); if (error) { up_read(&ls->ls_root_sem); goto out; diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h index 235e0d25cd48..dbc51013ecad 100644 --- a/fs/dlm/recover.h +++ b/fs/dlm/recover.h @@ -15,13 +15,13 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); uint32_t dlm_recover_status(struct dlm_ls *ls); void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); -int dlm_recover_members_wait(struct dlm_ls *ls); -int dlm_recover_directory_wait(struct dlm_ls *ls); -int dlm_recover_locks_wait(struct dlm_ls *ls); -int dlm_recover_done_wait(struct dlm_ls *ls); -int dlm_recover_masters(struct dlm_ls *ls); -int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); -int dlm_recover_locks(struct dlm_ls *ls); +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq); void dlm_recovered_lock(struct dlm_rsb *r); int dlm_create_root_list(struct dlm_ls *ls); void dlm_release_root_list(struct dlm_ls *ls); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 19da816cfb09..4d17491dea2f 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_NODES); - error = dlm_recover_members_wait(ls); + error = dlm_recover_members_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_members_wait error %d", error); goto fail; @@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * nodes their master rsb names that hash to us. */ - error = dlm_recover_directory(ls); + error = dlm_recover_directory(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_directory error %d", error); goto fail; @@ -111,7 +111,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DIR); - error = dlm_recover_directory_wait(ls); + error = dlm_recover_directory_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_directory_wait error %d", error); goto fail; @@ -145,7 +145,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * departed nodes. */ - error = dlm_recover_masters(ls); + error = dlm_recover_masters(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_masters error %d", error); goto fail; @@ -155,7 +155,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * Send our locks on remastered rsb's to the new masters. */ - error = dlm_recover_locks(ls); + error = dlm_recover_locks(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks error %d", error); goto fail; @@ -163,7 +163,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_LOCKS); - error = dlm_recover_locks_wait(ls); + error = dlm_recover_locks_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; @@ -187,7 +187,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) */ dlm_set_recover_status(ls, DLM_RS_LOCKS); - error = dlm_recover_locks_wait(ls); + error = dlm_recover_locks_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; @@ -206,7 +206,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DONE); - error = dlm_recover_done_wait(ls); + error = dlm_recover_done_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_done_wait error %d", error); goto fail; diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 8be2893ad15b..892d6ca21e74 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -30,7 +30,8 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. */ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms) { struct rq_entry *e; int length = le16_to_cpu(ms->m_header.h_length) - diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h index 4e403469a845..42bfe23ceabe 100644 --- a/fs/dlm/requestqueue.h +++ b/fs/dlm/requestqueue.h @@ -11,7 +11,8 @@ #ifndef __REQUESTQUEUE_DOT_H__ #define __REQUESTQUEUE_DOT_H__ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms); int dlm_process_requestqueue(struct dlm_ls *ls); void dlm_wait_requestqueue(struct dlm_ls *ls); void dlm_purge_requestqueue(struct dlm_ls *ls); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index e619c31b6bd9..b9575957a7c2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,7 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> +#include <linux/swap.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, static int stfu; if (sysctl_drop_caches & 1) { + lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c16f0d660cb7..03bd55069d86 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -441,10 +441,10 @@ int ecryptfs_encrypt_page(struct page *page) } lower_offset = lower_offset_for_page(crypt_stat, page); - enc_extent_virt = kmap(enc_extent_page); + enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); - kunmap(enc_extent_page); + kunmap_local(enc_extent_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write lower page; rc = [%d]\n", @@ -490,10 +490,10 @@ int ecryptfs_decrypt_page(struct page *page) BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap(page); + page_virt = kmap_local_page(page); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); - kunmap(page); + kunmap_local(page_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to read lower page; rc = [%d]\n", diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 83274915ba6d..992d9c7e64ae 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -148,7 +148,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, } fsstack_copy_attr_times(dir, lower_dir); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); out_unlock: dput(lower_dentry); inode_unlock(lower_dir); @@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -1011,7 +1011,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, + d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 373c3e5747e6..e2483acc4366 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -125,7 +125,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_atomic(page); + page_virt = kmap_local_page(page); memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { @@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, crypt_stat, &written); } - kunmap_atomic(page_virt); + kunmap_local(page_virt); flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " @@ -255,7 +255,6 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @flags: Various flags * @pagep: Pointer to return the page * @fsdata: Pointer to return fs data (unused) * diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 60bdcaddcbe5..3458f153a588 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_lower); + virt = kmap_local_page(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; - kunmap(page_for_lower); + kunmap_local(virt); return rc; } @@ -140,7 +140,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page); + ecryptfs_page_virt = kmap_local_page(ecryptfs_page); /* * pos: where we're now writing, offset: where the request was @@ -163,7 +163,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, (data + data_offset), num_bytes); data_offset += num_bytes; } - kunmap_atomic(ecryptfs_page_virt); + kunmap_local(ecryptfs_page_virt); flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); @@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int rc; offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_ecryptfs); + virt = kmap_local_page(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; - kunmap(page_for_ecryptfs); + kunmap_local(virt); flush_dcache_page(page_for_ecryptfs); return rc; } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d57ee15874f9..59b52718a3a2 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file, } else { inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode_unlock(inode); } diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index b973a2c03dde..db9231f0e77b 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index e028fafa04f3..996271473609 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 storage_space, remaining_space, max_variable_size; efi_status_t status; - status = efivar_query_variable_info(attr, &storage_space, &remaining_space, - &max_variable_size); - if (status != EFI_SUCCESS) - return efi_status_to_err(status); + /* Some UEFI firmware does not implement QueryVariableInfo() */ + storage_space = remaining_space = 0; + if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) { + status = efivar_query_variable_info(attr, &storage_space, + &remaining_space, + &max_variable_size); + if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED) + pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n", + status); + } /* * This is not a normal filesystem, so no point in pretending it has a block diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 2df1bac8b375..0833e533df9d 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -2,6 +2,7 @@ config EFS_FS tristate "EFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 13a4d9622633..918d2b9abb76 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 1999 Al Smith + * Copyright (c) 1999 Al Smith, <Al.Smith@aeschi.ch.eu.org> * * Portions derived from work (c) 1995,1996 Christian Vogelgsang. * Portions derived from IRIX header files (c) 1988 Silicon Graphics @@ -19,9 +19,6 @@ #define EFS_VERSION "1.0a" -static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith <Al.Smith@aeschi.ch.eu.org>"; - - /* 1 block is 512 bytes */ #define EFS_BLOCKSIZE_BITS 9 #define EFS_BLOCKSIZE (1 << EFS_BLOCKSIZE_BITS) diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 3ba94bb005a6..3789d22ba501 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -105,8 +105,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) inode->i_size = be32_to_cpu(efs_inode->di_size); inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); - inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; /* this is the number of blocks in the file */ if (inode->i_size == 0) { diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index f259d92c9720..f6dc961e6c2b 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -38,6 +38,7 @@ config EROFS_FS_DEBUG config EROFS_FS_XATTR bool "EROFS extended attributes" depends on EROFS_FS + select XXHASH default y help Extended attributes are name:value pairs associated with inodes by @@ -99,6 +100,21 @@ config EROFS_FS_ZIP_LZMA If unsure, say N. +config EROFS_FS_ZIP_DEFLATE + bool "EROFS DEFLATE compressed data support" + depends on EROFS_FS_ZIP + select ZLIB_INFLATE + help + Saying Y here includes support for reading EROFS file systems + containing DEFLATE compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + DEFLATE support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index a3a98fc3e481..994d0b9deddf 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index b1b846504027..349c3316ae6b 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -94,4 +94,6 @@ extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index db5e4b7636ec..0c2c99c58b5e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -413,14 +413,14 @@ const struct address_space_operations erofs_raw_access_aops = { #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { - return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); + return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..332ec5f74002 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -379,4 +379,10 @@ const struct z_erofs_decompressor erofs_decompressors[] = { .name = "lzma" }, #endif +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE + [Z_EROFS_COMPRESSION_DEFLATE] = { + .decompress = z_erofs_deflate_decompress, + .name = "deflate" + }, +#endif }; diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c new file mode 100644 index 000000000000..19e5bdeb30b6 --- /dev/null +++ b/fs/erofs/decompressor_deflate.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/zlib.h> +#include "compress.h" + +struct z_erofs_deflate { + struct z_erofs_deflate *next; + struct z_stream_s z; + u8 bounce[PAGE_SIZE]; +}; + +static DEFINE_SPINLOCK(z_erofs_deflate_lock); +static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms; +static struct z_erofs_deflate *z_erofs_deflate_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); + +module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); + +void z_erofs_deflate_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + continue; + } + z_erofs_deflate_head = NULL; + spin_unlock(&z_erofs_deflate_lock); + + while (strm) { + struct z_erofs_deflate *n = strm->next; + + vfree(strm->z.workspace); + kfree(strm); + --z_erofs_deflate_avail_strms; + strm = n; + } + } +} + +int __init z_erofs_deflate_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_deflate_nstrms) + z_erofs_deflate_nstrms = num_possible_cpus(); + + for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms; + ++z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) + goto out_failed; + + /* XXX: in-kernel zlib cannot shrink windowbits currently */ + strm->z.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!strm->z.workspace) { + kfree(strm); + goto out_failed; + } + + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + } + return 0; + +out_failed: + pr_err("failed to allocate zlib workspace\n"); + z_erofs_deflate_exit(); + return -ENOMEM; +} + +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size) +{ + if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) { + erofs_err(sb, "invalid deflate cfgs, size=%u", size); + return -EINVAL; + } + + if (dfl->windowbits > MAX_WBITS) { + erofs_err(sb, "unsupported windowbits %u", dfl->windowbits); + return -EOPNOTSUPP; + } + + erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!"); + return 0; +} + +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + struct super_block *sb = rq->sb; + unsigned int insz, outsz, pofs; + struct z_erofs_deflate *strm; + u8 *kin, *kout = NULL; + bool bounced = false; + int no = -1, ni = 0, j = 0, zerr, err; + + /* 1. get the exact DEFLATE compressed size */ + kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(kin); + return err; + } + + /* 2. get an available DEFLATE context */ +again: + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head)); + goto again; + } + z_erofs_deflate_head = strm->next; + spin_unlock(&z_erofs_deflate_lock); + + /* 3. multi-call decompress */ + insz = rq->inputsize; + outsz = rq->outputsize; + zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); + if (zerr != Z_OK) { + err = -EIO; + goto failed_zinit; + } + + pofs = rq->pageofs_out; + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); + insz -= strm->z.avail_in; + strm->z.next_in = kin + rq->pageofs_in; + strm->z.avail_out = 0; + + while (1) { + if (!strm->z.avail_out) { + if (++no >= nrpages_out || !outsz) { + erofs_err(sb, "insufficient space for decompressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) + kunmap_local(kout); + strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); + outsz -= strm->z.avail_out; + if (!rq->out[no]) { + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + pofs; + pofs = 0; + } + + if (!strm->z.avail_in && insz) { + if (++ni >= nrpages_in) { + erofs_err(sb, "invalid compressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) { /* unlike kmap(), take care of the orders */ + j = strm->z.next_out - kout; + kunmap_local(kout); + } + kunmap_local(kin); + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE); + insz -= strm->z.avail_in; + kin = kmap_local_page(rq->in[ni]); + strm->z.next_in = kin; + bounced = false; + if (kout) { + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + j; + } + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Or use short-lived pages from the + * on-stack pagepool where pages share among the same request + * and not _all_ inplace I/O pages are needed to be doubled. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in); + strm->z.next_in = strm->bounce; + bounced = true; + } + + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + + zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); + if (zerr != Z_OK || !(outsz + strm->z.avail_out)) { + if (zerr == Z_OK && rq->partial_decoding) + break; + if (zerr == Z_STREAM_END && !outsz) + break; + erofs_err(sb, "failed to decompress %d in[%u] out[%u]", + zerr, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + + if (zlib_inflateEnd(&strm->z) != Z_OK && !err) + err = -EIO; + if (kout) + kunmap_local(kout); +failed_zinit: + kunmap_local(kin); + /* 4. push back DEFLATE stream context to the global list */ + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + wake_up(&z_erofs_deflate_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2c7b16e340fe..a03ec70ba6f2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -13,6 +13,7 @@ #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 #define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -81,7 +82,8 @@ struct erofs_super_block { __u8 xattr_prefix_count; /* # of long xattr name prefixes */ __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ - __u8 reserved2[24]; + __u8 xattr_filter_reserved; /* reserved for xattr name filter */ + __u8 reserved2[23]; }; /* @@ -200,7 +202,7 @@ struct erofs_inode_extended { * for read-only fs, no need to introduce h_refcount */ struct erofs_xattr_ibody_header { - __le32 h_reserved; + __le32 h_name_filter; /* bit value 1 indicates not-present */ __u8 h_shared_count; __u8 h_reserved2[7]; __le32 h_shared_xattrs[]; /* shared xattr id array */ @@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_LONG_PREFIX 0x80 #define EROFS_XATTR_LONG_PREFIX_MASK 0x7f +#define EROFS_XATTR_FILTER_BITS 32 +#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX +#define EROFS_XATTR_FILTER_SEED 0x25BBE08F + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ @@ -289,6 +295,7 @@ struct erofs_dirent { enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, + Z_EROFS_COMPRESSION_DEFLATE = 2, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -309,6 +316,12 @@ struct z_erofs_lzma_cfgs { #define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_deflate_cfgs { + u8 windowbits; /* 8..15 for DEFLATE */ + u8 reserved[5]; +} __packed; + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e12592727a54..edc8ec7581b8 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -105,8 +105,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, set_nlink(inode, le32_to_cpu(die->i_nlink)); /* extended inode has its own timestamp */ - inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime); - inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec); + inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); @@ -148,8 +148,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, set_nlink(inode, le16_to_cpu(dic->i_nlink)); /* use build time for compact inodes */ - inode->i_ctime.tv_sec = sbi->build_time; - inode->i_ctime.tv_nsec = sbi->build_time_nsec; + inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) @@ -176,10 +175,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; - inode->i_atime.tv_sec = inode->i_ctime.tv_sec; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_mtime = inode->i_atime = inode_get_ctime(inode); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && @@ -373,7 +369,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 36e32fa542f0..4ff88d0dd980 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -151,6 +151,7 @@ struct erofs_sb_info { u32 xattr_prefix_start; u8 xattr_prefix_count; struct erofs_xattr_prefix_item *xattr_prefixes; + unsigned int xattr_filter_reserved; #endif u16 device_id_mask; /* valid bits of device id to be used */ @@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) +EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 @@ -270,6 +272,7 @@ struct erofs_inode { unsigned char inode_isize; unsigned int xattr_isize; + unsigned int xattr_name_filter; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; @@ -519,6 +522,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE +int __init z_erofs_deflate_init(void); +void z_erofs_deflate_exit(void); +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size); +#else +static inline int z_erofs_deflate_init(void) { return 0; } +static inline int z_erofs_deflate_exit(void) { return 0; } +static inline int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size) { + if (dfl) { + erofs_err(sb, "deflate algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 566f68ddfa36..44a24d573f1f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -19,10 +19,8 @@ #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; -struct file_system_type erofs_fs_type; -void _erofs_err(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -32,12 +30,11 @@ void _erofs_err(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); va_end(args); } -void _erofs_info(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -102,11 +99,9 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - /* be careful of RCU symlink path */ if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); - kmem_cache_free(erofs_inode_cachep, vi); } @@ -119,8 +114,7 @@ static bool check_layout_compatibility(struct super_block *sb, /* check if current kernel meets all mandatory requirements */ if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, - "unidentified incompatible feature %x, please upgrade kernel version", + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", feature & ~EROFS_ALL_FEATURE_INCOMPAT); return false; } @@ -201,6 +195,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZMA: ret = z_erofs_load_lzma_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_DEFLATE: + ret = z_erofs_load_deflate_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -388,6 +385,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; + sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); @@ -420,16 +418,11 @@ static int erofs_read_superblock(struct super_block *sb) if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); - if (erofs_sb_has_fragments(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); - if (erofs_sb_has_dedupe(sbi)) - erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } -/* set up default EROFS parameters */ static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP @@ -731,7 +724,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) xa_init(&sbi->managed_pslots); #endif - /* get the root inode */ inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -748,7 +740,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; erofs_shrinker_register(sb); - /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(sbi->packed_inode)) { @@ -881,10 +872,6 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } -/* - * could be triggered after deactivate_locked_super() - * is called, thus including umount and failed to initialize. - */ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -913,7 +900,6 @@ static void erofs_kill_sb(struct super_block *sb) sb->s_fs_info = NULL; } -/* called when ->s_root is non-NULL */ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); @@ -950,9 +936,9 @@ static int __init erofs_module_init(void) erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", - sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT, - erofs_inode_init_once); + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; @@ -964,6 +950,10 @@ static int __init erofs_module_init(void) if (err) goto lzma_err; + err = z_erofs_deflate_init(); + if (err) + goto deflate_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -984,6 +974,8 @@ fs_err: sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_deflate_exit(); +deflate_err: z_erofs_lzma_exit(); lzma_err: erofs_exit_shrinker(); @@ -1001,13 +993,13 @@ static void __exit erofs_module_exit(void) erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); + z_erofs_deflate_exit(); z_erofs_lzma_exit(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } -/* get filesystem statistics */ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 40178b6e0688..09d341675e89 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -5,6 +5,7 @@ * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> +#include <linux/xxhash.h> #include "xattr.h" struct erofs_xattr_iter { @@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) } ih = it.kaddr + erofs_blkoff(sb, it.pos); + vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { int ret; + unsigned int hashbit; struct erofs_xattr_iter it; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!name) return -EINVAL; @@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, if (ret) return ret; + /* reserved flag is non-zero if there's any change of on-disk format */ + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { + hashbit = xxh32(name, strlen(name), + EROFS_XATTR_FILTER_SEED + index); + hashbit &= EROFS_XATTR_FILTER_BITS - 1; + if (vi->xattr_name_filter & (1U << hashbit)) + return -ENOATTR; + } + it.index = index; it.name = (struct qstr)QSTR_INIT(name, strlen(name)); if (it.name.len > EROFS_NAME_LEN) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index de4f12152b62..036f610e044b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page) atomic_inc((atomic_t *)&page->private); } -static inline void z_erofs_page_mark_eio(struct page *page) +static void z_erofs_onlinepage_endio(struct page *page, int err) { - int orig; + int orig, v; + + DBG_BUGON(!PagePrivate(page)); do { orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); @@ -507,19 +502,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O. ] + * [ (*) the page above can be used as inplace I/O. ] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -535,8 +528,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - /* used for applying cache strategy on the fly */ - bool backmost; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ @@ -545,7 +536,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } + .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) { @@ -554,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; - if (fe->backmost) + if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && @@ -851,9 +842,11 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -863,8 +856,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -883,9 +875,26 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -908,12 +917,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -929,37 +938,29 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; buf.inode = packed_inode; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -972,94 +973,60 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - if (z_erofs_collector_end(fe)) - fe->backmost = false; - map->m_la = offset + cur; + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + z_erofs_pcluster_end(fe); + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_collector_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); - goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); + tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) + */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1072,8 +1039,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1094,9 +1059,7 @@ next_part: goto repeat; out: - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); return err; } @@ -1199,9 +1162,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - if (err) - z_erofs_page_mark_eio(bvi->bvec.page); - z_erofs_onlinepage_endio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page, err); list_del(p); kfree(bvi); } @@ -1372,9 +1333,7 @@ out: /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); } if (be->decompressed_pages != be->onstack_pages) @@ -1410,7 +1369,10 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(be.pcl->next); z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); - erofs_workgroup_put(&be.pcl->obj); + if (z_erofs_is_inline_pcluster(be.pcl)) + z_erofs_free_pcluster(be.pcl); + else + erofs_workgroup_put(&be.pcl->obj); } } @@ -1848,15 +1810,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, page = erofs_grab_cache_page_nowait(inode->i_mapping, index); if (page) { - if (PageUptodate(page)) { + if (PageUptodate(page)) unlock_page(page); - } else { - err = z_erofs_do_read_page(f, page); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - } + else + (void)z_erofs_do_read_page(f, page); put_page(page); } @@ -1868,25 +1825,25 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *const inode = page->mapping->host; + struct inode *const inode = folio->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; - trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + trace_erofs_read_folio(folio, false); + f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, &folio->page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - if (err) - erofs_err(inode->i_sb, "failed to read, err [%d]", err); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); @@ -1898,38 +1855,35 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *head = NULL, *page; - unsigned int nr_pages; + struct folio *head = NULL, *folio; + unsigned int nr_folios; + int err; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, true); - nr_pages = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + nr_folios = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - while ((page = readahead_page(rac))) { - set_page_private(page, (unsigned long)head); - head = page; + while ((folio = readahead_folio(rac))) { + folio->private = head; + head = folio; } + /* traverse in reverse order for best metadata I/O performance */ while (head) { - struct page *page = head; - int err; - - /* traversal in reverse order */ - head = (void *)page_private(page); + folio = head; + head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, page); - if (err) - erofs_err(inode->i_sb, - "readahead error at page %lu @ nid %llu", - page->index, EROFS_I(inode)->nid); - put_page(page); + err = z_erofs_do_read_page(&f, &folio->page); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1909ddafd9c7..7b55111fd533 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -561,8 +561,9 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && - map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= i_blocksize(inode))) { + (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) && + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa36cd37351..33a918f9566c 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -189,7 +189,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); - *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4b1b3362f697..1d9a71a0c4c1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -975,15 +975,11 @@ again: static int ep_alloc(struct eventpoll **pep) { - int error; - struct user_struct *user; struct eventpoll *ep; - user = get_current_user(); - error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) - goto free_uid; + return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); @@ -992,16 +988,12 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; - ep->user = user; + ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; - -free_uid: - free_uid(user); - return error; } /* diff --git a/fs/exec.c b/fs/exec.c index 1a827d55ba94..6518e33ea813 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) if (vma != vma_next(&vmi)) return -EFAULT; + vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ @@ -1276,8 +1277,8 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Must be called _before_ exec_mmap() as bprm->mm is - * not visible until then. This also enables the update - * to be lockless. + * not visible until then. Doing it here also ensures + * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 147edeb04469..cbeca8e44d9b 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -2,6 +2,7 @@ config EXFAT_FS tristate "exFAT filesystem support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 729ada9e26e8..f55498e5c23d 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -273,8 +273,6 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; - - struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 3cbd270e0cba..32395ef686a2 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) if (err) return err; - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); if (!IS_SYNC(inode)) @@ -232,7 +232,7 @@ int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; @@ -290,7 +290,7 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_valid & ATTR_SIZE) - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); setattr_copy(&nop_mnt_idmap, inode, attr); exfat_truncate_atime(&inode->i_atime); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 481dd338f2b8..13329baeafbc 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -355,7 +355,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); exfat_truncate(inode); } } @@ -398,7 +398,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, exfat_write_failed(mapping, pos+len); if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ei->attr |= ATTR_ARCHIVE; mark_inode_dirty(inode); } @@ -577,7 +577,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; inode->i_mtime = info->mtime; - inode->i_ctime = info->mtime; + inode_set_ctime_to_ts(inode, info->mtime); ei->i_crtime = info->crtime; inode->i_atime = info->atime; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e0ff9d156f6f..1b9f587f6cca 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -582,8 +582,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - EXFAT_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ @@ -817,7 +816,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = current_time(dir); + dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir); exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); @@ -825,7 +824,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) mark_inode_dirty(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); @@ -852,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -866,8 +865,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - EXFAT_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ @@ -979,7 +977,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = current_time(dir); + dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir); exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); @@ -988,7 +986,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); @@ -1312,8 +1310,8 @@ static int exfat_rename(struct mnt_idmap *idmap, goto unlock; inode_inc_iversion(new_dir); - new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = - EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + EXFAT_I(new_dir)->i_crtime = current_time(new_dir); exfat_truncate_atime(&new_dir->i_atime); if (IS_DIRSYNC(new_dir)) exfat_sync_inode(new_dir); @@ -1336,7 +1334,6 @@ static int exfat_rename(struct mnt_idmap *idmap, } inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); if (IS_DIRSYNC(old_dir)) exfat_sync_inode(old_dir); else @@ -1354,8 +1351,7 @@ static int exfat_rename(struct mnt_idmap *idmap, exfat_warn(sb, "abnormal access to an inode dropped"); WARN_ON(new_inode->i_nlink == 0); } - new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime = - current_time(new_inode); + EXFAT_I(new_inode)->i_crtime = current_time(new_inode); } unlock: diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8c32460e031e..2778bd9b631e 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -31,16 +31,6 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi) kfree(sbi->options.iocharset); } -static void exfat_delayed_free(struct rcu_head *p) -{ - struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); - - unload_nls(sbi->nls_io); - exfat_free_iocharset(sbi); - exfat_free_upcase_table(sbi); - kfree(sbi); -} - static void exfat_put_super(struct super_block *sb) { struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -50,7 +40,8 @@ static void exfat_put_super(struct super_block *sb) brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - call_rcu(&sbi->rcu, exfat_delayed_free); + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -379,8 +370,7 @@ static int exfat_read_root(struct inode *inode) ei->i_size_ondisk = i_size_read(inode); exfat_save_attr(inode, ATTR_SUBDIR); - inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - current_time(inode); + inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); return 0; } @@ -710,9 +700,6 @@ free_table: check_nls_io: unload_nls(sbi->nls_io); - exfat_free_iocharset(sbi); - sb->s_fs_info = NULL; - kfree(sbi); return err; } @@ -721,14 +708,18 @@ static int exfat_get_tree(struct fs_context *fc) return get_tree_bdev(fc, exfat_fill_super); } +static void exfat_free_sbi(struct exfat_sb_info *sbi) +{ + exfat_free_iocharset(sbi); + kfree(sbi); +} + static void exfat_free(struct fs_context *fc) { struct exfat_sb_info *sbi = fc->s_fs_info; - if (sbi) { - exfat_free_iocharset(sbi); - kfree(sbi); - } + if (sbi) + exfat_free_sbi(sbi); } static int exfat_reconfigure(struct fs_context *fc) @@ -773,12 +764,21 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void exfat_kill_sb(struct super_block *sb) +{ + struct exfat_sb_info *sbi = sb->s_fs_info; + + kill_block_super(sb); + if (sbi) + exfat_free_sbi(sbi); +} + static struct file_system_type exfat_fs_type = { .owner = THIS_MODULE, .name = "exfat", .init_fs_context = exfat_init_fs_context, .parameters = exfat_parameters, - .kill_sb = kill_block_super, + .kill_sb = exfat_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index d1dbe47c7975..c20704aa21b3 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -386,6 +386,7 @@ static int export_encode_fh(struct inode *inode, struct fid *fid, * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there + * @parent: parent directory inode, if wanted * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 77393fda99af..74d98965902e 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select BUFFER_HEAD select FS_IOMAP select LEGACY_DIRECT_IO help diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 82b17d7fc93f..7e54c31589c7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -237,7 +237,7 @@ ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, error = __ext2_set_acl(inode, acl, type); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } return error; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index eca60b747c6b..e124f3d709b2 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -36,8 +36,6 @@ */ -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh) @@ -474,8 +472,8 @@ void ext2_discard_reservation(struct inode *inode) * @block: start physical block to free * @count: number of blocks to free */ -void ext2_free_blocks (struct inode * inode, unsigned long block, - unsigned long count) +void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block, + unsigned long count) { struct buffer_head *bitmap_bh = NULL; struct buffer_head * bh2; @@ -718,36 +716,34 @@ fail_access: } /** - * find_next_reservable_window(): - * find a reservable space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem + * find_next_reservable_window - Find a reservable space within the given range. + * @search_head: The list to search. + * @my_rsv: The reservation we're currently using. + * @sb: The super block. + * @start_block: The first block we consider to start the real search from + * @last_block: The maximum block number that our goal reservable space + * could start from. * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. + * It does not allocate the reservation window: alloc_new_reservation() + * will do the work later. * - * @sb: the super block. + * We search the given range, rather than the whole reservation double + * linked list, (start_block, last_block) to find a free region that is + * of my size and has not been reserved. * - * @start_block: the first block we consider to start the real search from + * @search_head is not necessarily the list head of the whole filesystem. + * We have both head and @start_block to assist the search for the + * reservable space. The list starts from head, but we will shift to + * the place where start_block is, then start from there, when looking + * for a reservable space. * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. + * @last_block is normally the last block in this group. The search will end + * when we found the start of next possible reservable space is out + * of this boundary. This could handle the cross boundary reservation + * window request. * + * Return: -1 if we could not find a range of sufficient size. If we could, + * return 0 and fill in @my_rsv with the range information. */ static int find_next_reservable_window( struct ext2_reserve_window_node *search_head, @@ -835,41 +831,34 @@ static int find_next_reservable_window( } /** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. + * alloc_new_reservation - Allocate a new reservation window. + * @my_rsv: The reservation we're currently using. + * @grp_goal: The goal block relative to the start of the group. + * @sb: The super block. + * @group: The group we are trying to allocate in. + * @bitmap_bh: The block group block bitmap. * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. + * To make a new reservation, we search part of the filesystem reservation + * list (the list inside the group). We try to allocate a new + * reservation window near @grp_goal, or the beginning of the + * group, if @grp_goal is negative. * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. + * We first find a reservable space after the goal, then from there, + * we check the bitmap for the first free block after it. If there is + * no free block until the end of group, then the whole group is full, + * we failed. Otherwise, check if the free block is inside the expected + * reservable space, if so, we succeed. * - * failed: we failed to find a reservation window in this group + * If the first free block is outside the reservable space, then start + * from the first free block, we search for next available space, and + * go on. * - * @my_rsv: the reservation - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. - * if we have a goal(goal >0 ), then start from there, - * no goal(goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap + * on succeed, a new reservation will be found and inserted into the + * list. It contains at least one free block, and it does not overlap + * with other reservation windows. * + * Return: 0 on success, -1 if we failed to find a reservation window + * in this group */ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, ext2_grpblk_t grp_goal, struct super_block *sb, @@ -1133,8 +1122,13 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, if ((my_rsv->rsv_start > group_last_block) || (my_rsv->rsv_end < group_first_block)) { + ext2_error(sb, __func__, + "Reservation out of group %u range goal %d fsb[%lu,%lu] rsv[%lu, %lu]", + group, grp_goal, group_first_block, + group_last_block, my_rsv->rsv_start, + my_rsv->rsv_end); rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1); - BUG(); + return -1; } ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal, &num, &my_rsv->rsv_window); @@ -1195,6 +1189,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, * @goal: given target block(filesystem wide) * @count: target number of blocks to allocate * @errp: error code + * @flags: allocate flags * * ext2_new_blocks uses a goal block to assist allocation. If the goal is * free, or there is a free block within 32 blocks of the goal, that block @@ -1204,7 +1199,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, * This function also updates quota and i_blocks field. */ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, - unsigned long *count, int *errp) + unsigned long *count, int *errp, unsigned int flags) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gdp_bh; @@ -1243,15 +1238,15 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, es = EXT2_SB(sb)->s_es; ext2_debug("goal=%lu.\n", goal); /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) + * Allocate a block from reservation only when the filesystem is + * mounted with reservation(default,-o reservation), and it's a regular + * file, and the desired window size is greater than 0 (One could use + * ioctl command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn + * off reservation on that particular file). Also do not use the + * reservation window if the caller asked us not to do it. */ block_i = EXT2_I(inode)->i_block_alloc_info; - if (block_i) { + if (!(flags & EXT2_ALLOC_NORESERVE) && block_i) { windowsz = block_i->rsv_window_node.rsv_goal_size; if (windowsz > 0) my_rsv = &block_i->rsv_window_node; @@ -1431,13 +1426,6 @@ out: return 0; } -ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp) -{ - unsigned long count = 1; - - return ext2_new_blocks(inode, goal, &count, errp); -} - #ifdef EXT2FS_DEBUG unsigned long ext2_count_free(struct buffer_head *map, unsigned int numchars) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 42db804794bd..b335f17f682f 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, ext2_set_de_type(de, inode); ext2_commit_chunk(page, pos, len); if (update_times) - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); return ext2_handle_dirsync(dir); @@ -555,7 +555,7 @@ got_it: de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); ext2_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); err = ext2_handle_dirsync(dir); @@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; ext2_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); return ext2_handle_dirsync(inode); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 35a041c47c38..7fdd685c384d 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -399,6 +399,12 @@ struct ext2_inode { #define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE /* + * Allocation flags + */ +#define EXT2_ALLOC_NORESERVE 0x1 /* Do not use reservation + * window for allocation */ + +/* * Structure of the super block */ struct ext2_super_block { @@ -695,13 +701,11 @@ static inline struct ext2_inode_info *EXT2_I(struct inode *inode) /* balloc.c */ extern int ext2_bg_has_super(struct super_block *sb, int group); extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); -extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); -extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, - unsigned long *, int *); +extern ext2_fsblk_t ext2_new_blocks(struct inode *, ext2_fsblk_t, + unsigned long *, int *, unsigned int); extern int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, unsigned int count); -extern void ext2_free_blocks (struct inode *, unsigned long, - unsigned long); +extern void ext2_free_blocks(struct inode *, ext2_fsblk_t, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); extern unsigned long ext2_count_dirs (struct super_block *); extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0b4c91c62e1f..1039e5bf90af 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) } filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a4e1d7a9c544..c24d0de95a83 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -273,7 +273,6 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) if ((parent == d_inode(sb->s_root)) || (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { - struct ext2_group_desc *best_desc = NULL; int best_ndir = inodes_per_group; int best_group = -1; @@ -291,10 +290,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) continue; best_group = group; best_ndir = le16_to_cpu(desc->bg_used_dirs_count); - best_desc = desc; } if (best_group >= 0) { - desc = best_desc; group = best_group; goto found; } @@ -549,7 +546,7 @@ got: inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_flags = ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 75983215c7a1..314b415ee518 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -385,12 +385,16 @@ ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks, } /** - * ext2_alloc_blocks: multiple allocate blocks needed for a branch - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: the number of blocks need to allocate for direct blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, + * ext2_alloc_blocks: Allocate multiple blocks needed for a branch. + * @inode: Owner. + * @goal: Preferred place for allocation. + * @indirect_blks: The number of blocks needed to allocate for indirect blocks. + * @blks: The number of blocks need to allocate for direct blocks. + * @new_blocks: On return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block. + * @err: Error pointer. + * + * Return: Number of blocks allocated. */ static int ext2_alloc_blocks(struct inode *inode, ext2_fsblk_t goal, int indirect_blks, int blks, @@ -415,7 +419,7 @@ static int ext2_alloc_blocks(struct inode *inode, while (1) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext2_new_blocks(inode,goal,&count,err); + current_block = ext2_new_blocks(inode, goal, &count, err, 0); if (*err) goto failed_out; @@ -595,7 +599,7 @@ static void ext2_splice_branch(struct inode *inode, if (where->bh) mark_buffer_dirty_inode(where->bh, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } @@ -1082,8 +1086,8 @@ no_top: */ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) { - unsigned long block_to_free = 0, count = 0; - unsigned long nr; + ext2_fsblk_t block_to_free = 0, count = 0; + ext2_fsblk_t nr; for ( ; p < q ; p++) { nr = le32_to_cpu(*p); @@ -1123,7 +1127,7 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) { struct buffer_head * bh; - unsigned long nr; + ext2_fsblk_t nr; if (depth--) { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); @@ -1287,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) __ext2_truncate_blocks(inode, newsize); filemap_invalidate_unlock(inode->i_mapping); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); sync_inode_metadata(inode, 1); @@ -1409,9 +1413,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0); inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes @@ -1541,7 +1545,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(inode->i_size); raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); @@ -1628,7 +1632,7 @@ int ext2_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index cc87d413eb43..44e04484e570 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -44,7 +44,7 @@ int ext2_fileattr_set(struct mnt_idmap *idmap, (fa->flags & EXT2_FL_USER_MODIFIABLE); ext2_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; @@ -77,7 +77,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } inode_lock(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode->i_generation = generation; inode_unlock(inode); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 937dd8f60f96..059517068adc 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -211,7 +211,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (err) return err; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -291,7 +291,7 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); err = 0; out: @@ -367,7 +367,7 @@ static int ext2_rename (struct mnt_idmap * idmap, ext2_put_page(new_page, new_de); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -383,7 +383,7 @@ static int ext2_rename (struct mnt_idmap * idmap, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); err = ext2_delete_entry(old_de, old_page); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2959afc7541c..aaf3e3e88cb2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1572,7 +1572,7 @@ out: if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 8906ba479aaf..20f741184673 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -742,10 +742,13 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, EXT2_I(inode)->i_block_group); - int block = ext2_new_block(inode, goal, &error); + unsigned long count = 1; + ext2_fsblk_t block = ext2_new_blocks(inode, goal, + &count, &error, + EXT2_ALLOC_NORESERVE); if (error) goto cleanup; - ea_idebug(inode, "creating block %d", block); + ea_idebug(inode, "creating block %lu", block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { @@ -773,7 +776,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, /* Update the inode. */ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) { error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab28..e20d59221fc0 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 27fcbddfb148..3bffe862f954 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -259,7 +259,7 @@ retry: error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1f72f977c6db..79b20d6ae39e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -913,11 +913,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata clusters at + * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group) +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; @@ -935,8 +935,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } - return EXT4_NUM_B2C(sbi, num); + return num; } + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5504f72bbbbe..6fe3c941b565 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb) struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; - int flex_size = ext4_flex_bg_size(sbi); int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); @@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb) return -ENOMEM; for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + cond_resched(); - if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) { + if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1, 0); + meta_blks, 0); if (ret) goto err; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index e20ac0654b3f..453d4da5de52 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -33,6 +33,8 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } @@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095..9418359b1d9d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -176,9 +176,6 @@ enum criteria { EXT4_MB_NUM_CRS }; -/* criteria below which we use fast block scanning and avoid unnecessary IO */ -#define CR_FAST CR_GOAL_LEN_SLOW - /* * Flags used in mballoc's allocation_context flags field. * @@ -868,64 +865,70 @@ struct ext4_inode { * affected filesystem before 2242. */ -static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { - u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; - return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } -static inline void ext4_decode_extra_time(struct timespec64 *time, - __le32 extra) +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) { + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) - time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; } -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ - } \ - else \ - (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) -#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(einode)->xtime); \ -} while (0) +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - } \ - else \ - (inode)->xtime.tv_nsec = 0; \ + (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \ } while (0) +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) -#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (einode)->xtime.tv_sec = \ - (signed)le32_to_cpu((raw_inode)->xtime); \ - else \ - (einode)->xtime.tv_sec = 0; \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - ext4_decode_extra_time(&(einode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (einode)->xtime.tv_nsec = 0; \ +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version @@ -1235,6 +1238,7 @@ struct ext4_inode_info { #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1252,10 +1256,8 @@ struct ext4_inode_info { #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le -#define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le -#define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le @@ -1702,10 +1704,13 @@ struct ext4_sb_info { const char *s_last_error_func; time64_t s_last_error_time; /* - * If we are in a context where we cannot update error information in - * the on-disk superblock, we queue this work to do it. + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. */ - struct work_struct s_error_work; + struct work_struct s_sb_upd_work; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -1798,7 +1803,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; @@ -2222,9 +2226,9 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); #define EXT4_FLAGS_SHUTDOWN 1 #define EXT4_FLAGS_BDEV_IS_DAX 2 -static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +static inline int ext4_forced_shutdown(struct super_block *sb) { - return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } /* @@ -2702,7 +2706,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); @@ -2858,7 +2861,6 @@ extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); @@ -2901,7 +2903,6 @@ extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); @@ -2924,6 +2925,10 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, int state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, @@ -2977,7 +2982,6 @@ extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); @@ -3084,6 +3088,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, @@ -3525,8 +3531,6 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); int ext4_readpage_inline(struct inode *inode, struct folio *folio); @@ -3774,8 +3778,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /* For ioend & aio unwritten conversion wait queues */ #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 77f318ec8abb..d1a2e6624401 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -67,11 +67,12 @@ static int ext4_journal_check_start(struct super_block *sb) might_sleep(); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; - if (sb_rdonly(sb)) + if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* @@ -234,8 +235,7 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - if (bh->b_bdev->bd_super) - ext4_check_bdev_write_error(bh->b_bdev->bd_super); + ext4_check_bdev_write_error(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e4115d338f10..202c76996b62 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4476,12 +4476,12 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode->i_ctime; + inode->i_mtime = inode_get_ctime(inode); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4617,7 +4617,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -4642,7 +4642,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); @@ -5378,7 +5378,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -5488,7 +5488,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9b5b8951afb4..6f7de14c0fa8 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -878,23 +878,29 @@ retry: err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & EXTENT_STATUS_WRITTEN || status & EXTENT_STATUS_UNWRITTEN)) __revise_pending(inode, lblk, len); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) @@ -1491,8 +1497,12 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); - if (es && !es->es_len) - __es_free_extent(es); + /* Free preallocated extent if it didn't get used. */ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; @@ -2047,19 +2057,25 @@ retry: err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (allocated) __insert_pending(inode, lblk); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..6830ea3a6c59 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) @@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, { struct inode *inode = file_inode(in); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -476,6 +476,11 @@ restart: * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || @@ -492,21 +497,12 @@ restart: /* * Now that locking is settled, determine dio flags and exclusivity - * requirements. Unaligned writes are allowed under shared lock so long - * as they are pure overwrites. Set the iomap overwrite only flag as an - * added precaution in this case. Even though this is unnecessary, we - * can detect and warn on unexpected -EAGAIN if an unsafe unaligned - * write is ever submitted. - * - * Otherwise, concurrent unaligned writes risk data corruption due to - * partial block zeroing in the dio layer, and so the I/O must occur - * exclusively. The inode lock is already held exclusive if the write is - * non-overwrite or extending, so drain all outstanding dio and set the - * force wait dio flag. + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. */ - if (*ilock_shared && unaligned_io) { - *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; - } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -608,7 +604,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); - WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; @@ -709,7 +704,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX @@ -723,8 +718,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; @@ -740,7 +734,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is - * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ @@ -764,7 +758,7 @@ retry: } else { filemap_invalidate_lock_shared(mapping); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); @@ -773,7 +767,7 @@ retry: goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) - result = dax_finish_sync_fault(vmf, pe_size, pfn); + result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { @@ -785,7 +779,7 @@ retry: static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { - return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); + return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { @@ -807,10 +801,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct dax_device *dax_dev = sbi->s_daxdev; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* @@ -886,7 +879,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0c56f3a011a1..b40d3b29f7e5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,9 +131,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ASSERT(ext4_journal_current_handle() == NULL); @@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ + /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 46c3423ddfa1..deabe29da7fb 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -300,7 +300,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um && + if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 754f961cd9fd..b65058d972f9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -950,7 +950,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); @@ -1250,7 +1250,7 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ei->i_crtime = inode->i_mtime; memset(ei->i_data, 0, sizeof(ei->i_data)); @@ -1523,12 +1523,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; - /* This should not happen, but just to be sure check this */ - if (sb_rdonly(sb)) { - ret = 1; - goto out; - } - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4b7e4bc32d4..012d9259ff53 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -228,7 +228,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1991,7 +1991,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 7935ea6cf92c..f0c0fd507fbc 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -245,9 +245,9 @@ static void inode_test_xtimestamp_decoding(struct kunit *test) struct timestamp_expectation *test_param = (struct timestamp_expectation *)(test->param_value); - timestamp.tv_sec = get_32bit_time(test_param); - ext4_decode_extra_time(×tamp, - cpu_to_le32(test_param->extra_bits)); + timestamp = ext4_decode_extra_time( + cpu_to_le32(get_32bit_time(test_param)), + cpu_to_le32(test_param->extra_bits)); KUNIT_EXPECT_EQ_MSG(test, test_param->expected.tv_sec, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..4ce35f1c8b0a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1114,7 +1114,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); @@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (folio->index < mpd->first_page) continue; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2213,8 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(EXT4_SB(sb)) || - ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2455,7 +2454,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = folio->index; - mpd->next_page = folio->index + folio_nr_pages(folio); + mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -2534,14 +2533,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } @@ -2759,7 +2757,7 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); @@ -2798,16 +2796,16 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2857,7 +2855,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; @@ -2937,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, return 1; } +static int ext4_da_do_write_end(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + bool disksize_changed = false; + loff_t new_i_size; + + /* + * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES + * flag, which all that's needed to trigger page writeback. + */ + copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + new_i_size = pos + copied; + + /* + * It's important to update i_size while still holding page lock, + * because page writeout could otherwise come in and zero beyond + * i_size. + * + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range up to i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize up to i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block which ext4_da_should_update_i_disksize() + * checked, we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks and update i_disksize. + */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = (new_i_size - 1) & (PAGE_SIZE - 1); + if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + unlock_page(page); + put_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (disksize_changed) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + return copied; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); @@ -2963,30 +3020,7 @@ static int ext4_da_write_end(struct file *file, if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). - */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* @@ -3986,7 +4020,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; @@ -4146,7 +4180,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4249,7 +4283,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); @@ -4858,7 +4892,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } } - EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); @@ -4940,9 +4974,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; @@ -4981,7 +5018,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); @@ -5131,11 +5168,10 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -5255,7 +5291,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) @@ -5376,10 +5412,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ - if (!shrink) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; - } + if (!shrink) + inode->i_mtime = inode_set_ctime_current(inode); if (shrink) ext4_fc_track_range(handle, inode, @@ -5537,7 +5571,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } @@ -5676,7 +5710,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } @@ -5702,7 +5736,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); @@ -6140,7 +6174,7 @@ retry_alloc: if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 331859511f80..0bfe2ce589e2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -449,7 +449,8 @@ static long swap_inode_boot_loader(struct super_block *sb, diff = size - size_bl; swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_set_ctime_current(inode); + inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); @@ -663,7 +664,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -774,7 +775,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -801,7 +802,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; - if (ext4_forced_shutdown(sbi)) + if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); @@ -1266,7 +1267,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21b903fe546e..1e599305d85f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> +#include <linux/freezer.h> #include <trace/events/ext4.h> /* @@ -874,7 +875,7 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter, *grp; + struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) @@ -883,7 +884,6 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; @@ -892,28 +892,22 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } - grp = NULL; list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - grp = iter; - break; + *group = iter->bb_group; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - if (grp) - break; } - if (!grp) { - /* Increment cr and search again */ - *new_cr = CR_GOAL_LEN_FAST; - } else { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - } + /* Increment cr and search again if no group is found */ + *new_cr = CR_GOAL_LEN_FAST; } /* @@ -966,16 +960,25 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - } else { + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) *new_cr = CR_BEST_AVAIL_LEN; - } + else + *new_cr = CR_GOAL_LEN_SLOW; } /* @@ -1051,18 +1054,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - } else { - /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; - } + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1080,8 +1081,9 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) * Return next linear group for allocation. If linear traversal should not be * performed, this function just returns the same group */ -static int -next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +static ext4_group_t +next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, + ext4_group_t ngroups) { if (!should_optimize_scan(ac)) goto inc_and_return; @@ -1255,7 +1257,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; - int blocksize; + unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; @@ -2450,7 +2452,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR_FAST) { + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough @@ -2553,7 +2555,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; @@ -2634,7 +2636,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) + /* + * In all criterias except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2658,7 +2665,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ - if (cr < CR_FAST && + if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2787,8 +2794,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * ac->ac_2order is set only if the fe_len is a power of 2 - * if ac->ac_2order is set we also set criteria to 0 so that we - * try exact allocation using buddy. + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; @@ -2800,10 +2807,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } @@ -2848,11 +2852,11 @@ repeat: /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit - * prefetching at cr=0/1, otherwise mballoc can - * spend a lot of time loading imperfect groups + * prefetching at inexpensive CR, otherwise mballoc + * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr >= CR_FAST || + (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -3501,11 +3505,10 @@ static void ext4_discard_work(struct work_struct *work) struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; - INIT_LIST_HEAD(&discard_list); spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); @@ -3879,12 +3882,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct list_head freed_data_list; + LIST_HEAD(freed_data_list); struct list_head *cut_pos = NULL; bool wake; - INIT_LIST_HEAD(&freed_data_list); - spin_lock(&sbi->s_md_lock); list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { if (entry->efd_tid != commit_tid) @@ -4084,7 +4085,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err; + int i, err = 0; int already; unsigned int clen, clen_changed, thisgrp_len; @@ -4222,12 +4223,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_ static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t start, ext4_lblk_t end) + ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); @@ -4236,7 +4238,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) @@ -4258,14 +4260,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; - ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; @@ -4284,7 +4286,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); @@ -4364,8 +4366,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, } if (left_pa) { - left_pa_end = - left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } @@ -4404,8 +4405,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; + loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; @@ -4432,7 +4432,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -4766,7 +4766,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; - loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4862,9 +4861,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ - tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } @@ -4984,7 +4981,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } - return; } /* @@ -5180,8 +5176,11 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { - int new_bex_start; - int new_bex_end; + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -5200,29 +5199,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * still cover original start * 3. Else, keep the best ex at start of original request. */ - new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len); - new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical >= new_bex_start) - goto adjust_bex; + ex.fe_len = ac->ac_b_ex.fe_len; - new_bex_start = ac->ac_g_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical < new_bex_end) + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; - new_bex_start = ac->ac_o_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) + goto adjust_bex; + ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: - ac->ac_b_ex.fe_logical = new_bex_start; + ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len))); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5419,7 +5412,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; @@ -5448,7 +5441,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - INIT_LIST_HEAD(&list); ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { @@ -5529,7 +5521,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; @@ -5546,8 +5538,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active), needed); - INIT_LIST_HEAD(&list); - if (needed == 0) needed = UINT_MAX; @@ -5671,7 +5661,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5705,7 +5695,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; mb_debug(sb, "Can't allocate:" @@ -5738,12 +5728,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) #else static inline void ext4_mb_show_pa(struct super_block *sb) { - return; } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); - return; } #endif @@ -5769,7 +5757,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -5865,13 +5853,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, { ext4_group_t group = 0; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); - INIT_LIST_HEAD(&discard_list); - spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, @@ -5984,12 +5970,9 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { + if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); - return; - } - return ; } /* @@ -6102,7 +6085,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || @@ -6643,7 +6626,6 @@ do_more: error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; } /** @@ -6746,7 +6728,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_mb_clear_bb(handle, inode, block, count, flags); - return; } /** @@ -6926,6 +6907,21 @@ __acquires(bitlock) return ret; } +static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, + ext4_group_t grp) +{ + if (grp < ext4_get_groups_count(sb)) + return EXT4_CLUSTERS_PER_GROUP(sb) - 1; + return (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp) - 1) >> + EXT4_CLUSTER_BITS(sb); +} + +static bool ext4_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) @@ -6933,11 +6929,13 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count; + bool set_trimmed = false; void *bitmap; bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + set_trimmed = true; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6951,16 +6949,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) - break; + return count; count += next - start; } free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } + if (ext4_trim_interrupted()) + return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -6972,6 +6968,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) break; } + if (set_trimmed) + EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); + return count; } @@ -6982,7 +6981,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count - * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy @@ -6992,7 +6990,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks, bool set_trimmed) + ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; @@ -7009,13 +7007,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < EXT4_SB(sb)->s_last_trim_minblks) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0 && set_trimmed) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } else { + else ret = 0; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -7048,7 +7043,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -7067,10 +7061,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks - 1) { + if (end >= max_blks - 1) end = max_blks - 1; - eof = true; - } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -7084,9 +7076,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - whole_group = true; for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; grp = ext4_get_group_info(sb, group); if (!grp) continue; @@ -7103,13 +7096,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) { + if (group == last_group) end = last_cluster; - whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; - } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen, whole_group); + end, minlen); if (cnt < 0) { ret = cnt; break; @@ -7154,8 +7145,7 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, start); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index df6b5e7c2274..d7aeb5da7d86 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -233,6 +233,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0aaf38ffcb6e..bd946d0c71b7 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !sb_rdonly(sb)) { + while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b5af2fc03b2f..18a9e7c47975 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -340,10 +340,8 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -362,10 +360,8 @@ data_copy: /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } @@ -392,14 +388,11 @@ data_copy: for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) - break; + goto repair_branches; bh = bh->b_this_page; } - if (!*err) - *err = block_commit_write(&folio[0]->page, from, from + replaced_size); - if (unlikely(*err < 0)) - goto repair_branches; + block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0caf6c730ce3..bbda587f76b8 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + - (EXT4_BLOCK_SIZE(inode->i_sb) - - sizeof(struct ext4_dir_entry_tail))); - while (d < top && d->rec_len) + (blocksize - sizeof(struct ext4_dir_entry_tail))); + while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + - le16_to_cpu(d->rec_len)); + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; @@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, #endif if (t->det_reserved_zero1 || - le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != + sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; @@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); - if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + if (rlen == blocksize) count_offset = 8; - else if (le16_to_cpu(dirent->rec_len) == 12) { + else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); - if (le16_to_cpu(dp->rec_len) != - EXT4_BLOCK_SIZE(inode->i_sb) - 12) + if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || @@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; + int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); @@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; - map_tail->size = le16_to_cpu(de->rec_len); + map_tail->size = ext4_rec_len_from_disk(de->rec_len, + blocksize); count++; cond_resched(); } - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de, blocksize); } return count; } @@ -1445,7 +1449,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; @@ -1496,7 +1500,7 @@ static bool ext4_match(struct inode *parent, #endif #if IS_ENABLED(CONFIG_UNICODE) - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, @@ -2203,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -2393,7 +2397,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) + utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif @@ -2799,6 +2803,7 @@ static int ext4_add_nondir(handle_t *handle, return err; } drop_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; @@ -3142,7 +3147,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in @@ -3197,7 +3202,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_current(dir); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; @@ -3271,7 +3277,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3286,7 +3292,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); @@ -3301,7 +3307,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); @@ -3369,7 +3375,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, @@ -3436,6 +3442,7 @@ retry: err_drop_inode: clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) @@ -3463,7 +3470,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); @@ -3641,8 +3648,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_ctime = ent->dir->i_mtime = - current_time(ent->dir); + ent->dir->i_mtime = inode_set_ctime_current(ent->dir); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3941,7 +3947,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = current_time(old.inode); + inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; @@ -3955,9 +3961,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.inode) { ext4_dec_count(new.inode); - new.inode->i_ctime = current_time(new.inode); + inode_set_ctime_current(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); + old.dir->i_mtime = inode_set_ctime_current(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -4021,6 +4027,7 @@ end_rename: ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); @@ -4053,7 +4060,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; - struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, @@ -4147,9 +4153,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - ctime = current_time(old.inode); - old.inode->i_ctime = ctime; - new.inode->i_ctime = ctime; + inode_set_ctime_current(old.inode); + inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; @@ -4189,7 +4194,7 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3621f29ec671..dfdd7e5cf038 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -184,7 +184,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..dbebd8b3127e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -93,6 +93,7 @@ static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); +static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* @@ -135,12 +136,12 @@ static struct file_system_type ext2_fs_type = { .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif @@ -151,12 +152,12 @@ static struct file_system_type ext3_fs_type = { .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); -#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, @@ -433,6 +434,57 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ + +/* + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than an hour has passed since the last superblock update, and + * 2. More than 16MB have been written since the last superblock update. + * + * @sb: The superblock + */ +static void ext4_maybe_update_superblock(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || + !journal || (journal->j_flags & JBD2_UNMOUNT)) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); + + if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Get the number of kilobytes not written to disk to account + * for statistics and compare with a multiple of 16 MB. This + * is used to determine when the next superblock commit should + * occur (i.e. not more often than once per 16MB if there was + * less written in an hour). + */ + diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); +} + /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -459,6 +511,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); + ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { @@ -657,7 +710,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -671,7 +724,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * defer superblock flushing to a workqueue. */ if (continue_fs && journal) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } @@ -698,10 +751,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_flags |= SB_RDONLY; } -static void flush_stashed_error_work(struct work_struct *work) +static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, - s_error_work); + s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; @@ -715,6 +768,7 @@ static void flush_stashed_error_work(struct work_struct *work) */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; @@ -722,6 +776,10 @@ static void flush_stashed_error_work(struct work_struct *work) jbd2_journal_stop(handle); goto write_directly; } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " @@ -735,7 +793,10 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); - ext4_notify_error_sysfs(sbi); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + return; } write_directly: @@ -758,7 +819,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -783,7 +844,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -818,7 +879,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -898,7 +959,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -992,7 +1053,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -1018,7 +1079,7 @@ __acquires(bitlock) if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } @@ -1096,54 +1157,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -static void ext4_bdev_mark_dead(struct block_device *bdev) -{ - ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH); -} - -static const struct blk_holder_ops ext4_holder_ops = { - .mark_dead = ext4_bdev_mark_dead, -}; - -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &ext4_holder_ops); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return NULL; -} - -/* - * Release the journal device - */ -static void ext4_blkdev_remove(struct ext4_sb_info *sbi) -{ - struct block_device *bdev; - bdev = sbi->s_journal_bdev; - if (bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - invalidate_bdev(bdev); - blkdev_put(bdev, sbi->s_sb); - sbi->s_journal_bdev = NULL; - } -} - static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; @@ -1278,10 +1291,10 @@ static void ext4_put_super(struct super_block *sb) * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL - * Unregister sysfs before flush sbi->s_error_work. + * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. - * flush_stashed_error_work will call start_this_handle may trigger + * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); @@ -1293,7 +1306,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); @@ -1339,8 +1352,13 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ sync_blockdev(sbi->s_journal_bdev); - ext4_blkdev_remove(sbi); + invalidate_bdev(sbi->s_journal_bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1897,6 +1915,7 @@ static const struct mount_opts { {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif + {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; @@ -1965,8 +1984,6 @@ struct ext4_fs_context { unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; - unsigned long vals_s_mount_flags; - unsigned long mask_s_mount_flags; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; @@ -2117,12 +2134,6 @@ EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); -static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit) -{ - set_bit(bit, &ctx->mask_s_mount_flags); - set_bit(bit, &ctx->vals_s_mount_flags); -} - static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; @@ -2186,9 +2197,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; - case Opt_abort: - ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2842,8 +2850,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; - sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; - sbi->s_mount_flags |= ctx->vals_s_mount_flags; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; @@ -4232,7 +4238,7 @@ int ext4_calculate_overhead(struct super_block *sb) else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); - if (j_inode) { + if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); @@ -4970,8 +4976,8 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before destroying the journal. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; @@ -5294,7 +5300,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) @@ -5572,7 +5578,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); - sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -5638,16 +5643,16 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before journal destroy. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - /* flush s_error_work before sbi destroy */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); @@ -5664,9 +5669,11 @@ failed_mount: kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); - /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ brelse(sbi->s_sbh); - ext4_blkdev_remove(sbi); + if (sbi->s_journal_bdev) { + invalidate_bdev(sbi->s_journal_bdev); + blkdev_put(sbi->s_journal_bdev, sb); + } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; @@ -5772,22 +5779,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; + return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); return journal_inode; } @@ -5813,24 +5820,21 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) return 0; } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - journal_inode = ext4_get_journal_inode(sb, journal_inum); - if (!journal_inode) - return NULL; + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); - return NULL; + return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; @@ -5838,40 +5842,47 @@ static journal_t *ext4_get_journal(struct super_block *sb, return journal; } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) { struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; + struct block_device *bdev; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; - struct block_device *bdev; - - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; + int errno; - bdev = ext4_blkdev_get(j_dev, sb); - if (bdev == NULL) - return NULL; + /* see get_tree_bdev why this is needed and safe */ + up_write(&sb->s_umount); + bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, + &fs_holder_ops); + down_write(&sb->s_umount); + if (IS_ERR(bdev)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); + return ERR_CAST(bdev); + } blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); + errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); + errno = -EINVAL; goto out_bdev; } @@ -5879,57 +5890,74 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "corrupt superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; + errno = -EFSCORRUPTED; + goto out_bh; } - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev; + +out_bh: + brelse(bh); +out_bdev: + blkdev_put(bdev, sb); + return ERR_PTR(errno); +} - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct block_device *journal_bdev; + int errno = 0; + + journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(journal_bdev)) + return ERR_CAST(journal_bdev); + + journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, + j_len, sb->s_blocksize); + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); goto out_bdev; } - journal->j_private = sb; - if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); + errno = -EINVAL; goto out_journal; } - EXT4_SB(sb)->s_journal_bdev = bdev; + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev = journal_bdev; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(bdev, sb); - return NULL; + blkdev_put(journal_bdev, sb); + return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, @@ -5961,13 +5989,13 @@ static int ext4_load_journal(struct super_block *sb, } if (journal_inum) { - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); } else { - journal = ext4_get_dev_journal(sb, journal_dev); - if (!journal) - return -EINVAL; + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); @@ -6084,7 +6112,7 @@ static void ext4_update_super(struct super_block *sb) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & SB_RDONLY)) + if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + @@ -6282,13 +6310,7 @@ static int ext4_clear_journal_err(struct super_block *sb, */ int ext4_force_commit(struct super_block *sb) { - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; - return ext4_journal_force_commit(journal); + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -6298,7 +6320,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); @@ -6347,12 +6369,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) static int ext4_freeze(struct super_block *sb) { int error = 0; - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; + journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -6386,7 +6403,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6502,7 +6519,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -6516,10 +6533,10 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } /* Flush outstanding errors before changing fs state */ - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } @@ -6680,7 +6697,7 @@ restore_opts: * If there was a failing r/w to ro transition, we may need to * re-enable quota */ - if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); sb->s_flags = old_sb_flags; @@ -7089,6 +7106,13 @@ static int ext4_quota_off(struct super_block *sb, int type) err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. + */ + if (sb_rdonly(sb)) + goto out_put; inode_lock(inode); /* @@ -7103,7 +7127,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7273,12 +7297,23 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 1; } +static void ext4_kill_sb(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL; + + kill_block_super(sb); + + if (journal_bdev) + blkdev_put(journal_bdev, sb); +} + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 05151d61b00b..92ba28cebac6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -356,13 +356,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { - return ((u64)ea_inode->i_ctime.tv_sec << 32) | + return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { - ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } @@ -701,7 +701,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) @@ -2473,7 +2473,7 @@ retry_inode: } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 03ef087537c7..68a1e23e1557 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD select NLS select CRYPTO select CRYPTO_CRC32 diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8fd3b7f9fb88..b0597a539fc5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1701,9 +1701,9 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } f2fs_restore_inmem_curseg(sbi); + stat_inc_cp_count(sbi); stop: unblock_operations(sbi); - stat_inc_cp_count(sbi->stat_info); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 236d890f560b..d820801f473e 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -649,13 +649,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) goto destroy_compress_ctx; } - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) cc->cpages[i] = f2fs_compress_alloc_page(); - if (!cc->cpages[i]) { - ret = -ENOMEM; - goto out_free_cpages; - } - } cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { @@ -1045,7 +1040,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, struct address_space *mapping = cc->inode->i_mapping; struct page *page; sector_t last_block_in_bio; - unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; @@ -1574,8 +1569,6 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, } dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) - return -ENOMEM; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); @@ -1656,11 +1649,6 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) struct page *page; page = f2fs_compress_alloc_page(); - if (!page) { - ret = -ENOMEM; - goto out_free; - } - f2fs_set_compressed_page(page, cc->inode, start_idx + i + 1, dic); dic->cpages[i] = page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5882afe71d82..916e317ac925 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1167,6 +1167,9 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, f2fs_wait_on_block_writeback(inode, blkaddr); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + iostat_update_and_unbind_ctx(bio); + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); return -EFAULT; } @@ -1389,18 +1392,14 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, { struct address_space *mapping = inode->i_mapping; struct page *page; -repeat: + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; /* wait for read completion */ lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); - goto repeat; - } - if (unlikely(!PageUptodate(page))) { + if (unlikely(page->mapping != mapping || !PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -3236,8 +3235,7 @@ result: } goto next; } - done_index = folio->index + - folio_nr_pages(folio); + done_index = folio_next_index(folio); done = 1; break; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 61c35b59126e..fdbf994f1271 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -215,6 +215,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_blks[type] += blks; } + for (i = 0; i < MAX_CALL_TYPE; i++) + si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]); + for (i = 0; i < 2; i++) { si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; @@ -497,7 +500,9 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); seq_printf(s, "CP calls: %d (BG: %d)\n", - si->cp_count, si->bg_cp_count); + si->cp_call_count[TOTAL_CALL], + si->cp_call_count[BACKGROUND]); + seq_printf(s, "CP count: %d\n", si->cp_count); seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]); seq_printf(s, " - sit blocks : %u\n", si->meta_count[META_SIT]); @@ -511,12 +516,24 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); - seq_printf(s, "GC calls: %d (BG: %d)\n", - si->call_count, si->bg_gc); - seq_printf(s, " - data segments : %d (%d)\n", - si->data_segs, si->bg_data_segs); - seq_printf(s, " - node segments : %d (%d)\n", - si->node_segs, si->bg_node_segs); + seq_printf(s, "GC calls: %d (gc_thread: %d)\n", + si->gc_call_count[BACKGROUND] + + si->gc_call_count[FOREGROUND], + si->gc_call_count[BACKGROUND]); + if (__is_large_section(sbi)) { + seq_printf(s, " - data sections : %d (BG: %d)\n", + si->gc_secs[DATA][BG_GC] + si->gc_secs[DATA][FG_GC], + si->gc_secs[DATA][BG_GC]); + seq_printf(s, " - node sections : %d (BG: %d)\n", + si->gc_secs[NODE][BG_GC] + si->gc_secs[NODE][FG_GC], + si->gc_secs[NODE][BG_GC]); + } + seq_printf(s, " - data segments : %d (BG: %d)\n", + si->gc_segs[DATA][BG_GC] + si->gc_segs[DATA][FG_GC], + si->gc_segs[DATA][BG_GC]); + seq_printf(s, " - node segments : %d (BG: %d)\n", + si->gc_segs[NODE][BG_GC] + si->gc_segs[NODE][FG_GC], + si->gc_segs[NODE][BG_GC]); seq_puts(s, " - Reclaimed segs :\n"); seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]); seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]); @@ -687,6 +704,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); + for (i = 0; i < MAX_CALL_TYPE; i++) + atomic_set(&sbi->cp_call_count[i], 0); atomic_set(&sbi->max_aw_cnt, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d635c58cf5a3..8aa29fe2e87b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -858,7 +858,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7cb2177b252..6d688e42d89c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1383,6 +1383,13 @@ enum errors_option { MOUNT_ERRORS_PANIC, /* panic on errors */ }; +enum { + BACKGROUND, + FOREGROUND, + MAX_CALL_TYPE, + TOTAL_CALL = FOREGROUND, +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1695,6 +1702,7 @@ struct f2fs_sb_info { unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ + atomic_t cp_call_count[MAX_CALL_TYPE]; /* # of cp call */ #endif spinlock_t stat_lock; /* lock for stat operations */ @@ -2114,15 +2122,6 @@ static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) return down_read_trylock(&sem->internal_rwsem); } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) -{ - down_read_nested(&sem->internal_rwsem, subclass); -} -#else -#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) -#endif - static inline void f2fs_up_read(struct f2fs_rwsem *sem) { up_read(&sem->internal_rwsem); @@ -2133,6 +2132,21 @@ static inline void f2fs_down_write(struct f2fs_rwsem *sem) down_write(&sem->internal_rwsem); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); +} + +static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_write_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) +#endif + static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) { return down_write_trylock(&sem->internal_rwsem); @@ -2736,7 +2750,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp_mask) + fgf_t fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return NULL; @@ -3303,9 +3317,11 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_is_time_consistent(struct inode *inode) { + struct timespec64 ctime = inode_get_ctime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime)) return false; if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; @@ -3885,7 +3901,7 @@ struct f2fs_stat_info { int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_wb_cp_data, nr_wb_data; int nr_rd_data, nr_rd_node, nr_rd_meta; int nr_dio_read, nr_dio_write; unsigned int io_skip_bggc, other_skip_bggc; @@ -3905,9 +3921,11 @@ struct f2fs_stat_info { int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages, compress_pages; int compress_page_hit; - int prefree_count, call_count, cp_count, bg_cp_count; - int tot_segs, node_segs, data_segs, free_segs, free_secs; - int bg_node_segs, bg_data_segs; + int prefree_count, free_segs, free_secs; + int cp_call_count[MAX_CALL_TYPE], cp_count; + int gc_call_count[MAX_CALL_TYPE]; + int gc_segs[2][2]; + int gc_secs[2][2]; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; int curseg[NR_CURSEG_TYPE]; @@ -3929,10 +3947,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info *)sbi->stat_info; } -#define stat_inc_cp_count(si) ((si)->cp_count++) -#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) -#define stat_inc_call_count(si) ((si)->call_count++) -#define stat_inc_bggc_count(si) ((si)->bg_gc++) +#define stat_inc_cp_call_count(sbi, foreground) \ + atomic_inc(&sbi->cp_call_count[(foreground)]) +#define stat_inc_cp_count(si) (F2FS_STAT(sbi)->cp_count++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) @@ -4017,18 +4034,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_seg_count(sbi, type, gc_type) \ - do { \ - struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - si->tot_segs++; \ - if ((type) == SUM_TYPE_DATA) { \ - si->data_segs++; \ - si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ - } else { \ - si->node_segs++; \ - si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ - } \ - } while (0) +#define stat_inc_gc_call_count(sbi, foreground) \ + (F2FS_STAT(sbi)->gc_call_count[(foreground)]++) +#define stat_inc_gc_sec_count(sbi, type, gc_type) \ + (F2FS_STAT(sbi)->gc_secs[(type)][(gc_type)]++) +#define stat_inc_gc_seg_count(sbi, type, gc_type) \ + (F2FS_STAT(sbi)->gc_segs[(type)][(gc_type)]++) #define stat_inc_tot_blk_count(si, blks) \ ((si)->tot_blks += (blks)) @@ -4055,10 +4066,8 @@ void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else -#define stat_inc_cp_count(si) do { } while (0) -#define stat_inc_bg_cp_count(si) do { } while (0) -#define stat_inc_call_count(si) do { } while (0) -#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_cp_call_count(sbi, foreground) do { } while (0) +#define stat_inc_cp_count(sbi) do { } while (0) #define stat_io_skip_bggc_count(sbi) do { } while (0) #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) @@ -4086,7 +4095,9 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) -#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_gc_call_count(sbi, foreground) do { } while (0) +#define stat_inc_gc_sec_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_gc_seg_count(sbi, type, gc_type) do { } while (0) #define stat_inc_tot_blk_count(si, blks) do { } while (0) #define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) @@ -4423,6 +4434,22 @@ static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, } #endif +static inline int f2fs_bdev_index(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return i; + + WARN_ON(1); + return -1; +} + static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { return f2fs_sb_has_blkzoned(sbi); @@ -4483,7 +4510,8 @@ static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode)) + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) || + f2fs_is_mmap_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 093039dee992..ca5904129b16 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -159,7 +159,7 @@ out_sem: sb_end_pagefault(inode->i_sb); err: - return block_page_mkwrite_return(err); + return vmf_fs_error(err); } static const struct vm_operations_struct f2fs_file_vm_ops = { @@ -526,7 +526,11 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; + + f2fs_down_read(&F2FS_I(inode)->i_sem); set_inode_flag(inode, FI_MMAP_FILE); + f2fs_up_read(&F2FS_I(inode)->i_sem); + return 0; } @@ -794,7 +798,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -882,7 +886,7 @@ int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -905,7 +909,7 @@ static void __setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); @@ -1008,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return err; spin_lock(&F2FS_I(inode)->i_size_lock); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); F2FS_I(inode)->last_disk_size = i_size_read(inode); spin_unlock(&F2FS_I(inode)->i_size_lock); } @@ -1724,6 +1728,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) goto out_err; @@ -1835,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1919,12 +1924,19 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) int err = f2fs_convert_inline_inode(inode); if (err) return err; - if (!f2fs_may_compress(inode)) - return -EINVAL; - if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + + f2fs_down_write(&F2FS_I(inode)->i_sem); + if (!f2fs_may_compress(inode) || + (S_ISREG(inode->i_mode) && + F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&F2FS_I(inode)->i_sem); return -EINVAL; - if (set_compress_context(inode)) - return -EOPNOTSUPP; + } + err = set_compress_context(inode); + f2fs_up_write(&F2FS_I(inode)->i_sem); + + if (err) + return err; } } @@ -1937,7 +1949,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) else clear_inode_flag(inode, FI_PROJ_INHERIT); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_set_inode_flags(inode); f2fs_mark_inode_dirty_sync(inode, true); return 0; @@ -2465,6 +2477,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) gc_control.init_gc_type = sync ? FG_GC : BG_GC; gc_control.err_gc_skipped = sync; + stat_inc_gc_call_count(sbi, FOREGROUND); ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); @@ -2508,6 +2521,7 @@ do_more: } gc_control.victim_segno = GET_SEGNO(sbi, range->start); + stat_inc_gc_call_count(sbi, FOREGROUND); ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) @@ -2874,10 +2888,10 @@ out_src: if (ret) goto out_unlock; - src->i_mtime = src->i_ctime = current_time(src); + src->i_mtime = inode_set_ctime_current(src); f2fs_mark_inode_dirty_sync(src, false); if (src != dst) { - dst->i_mtime = dst->i_ctime = current_time(dst); + dst->i_mtime = inode_set_ctime_current(dst); f2fs_mark_inode_dirty_sync(dst, false); } f2fs_update_time(sbi, REQ_TIME); @@ -2990,6 +3004,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[ALLOC_NEXT] = end_segno + 1; gc_control.victim_segno = start_segno; + stat_inc_gc_call_count(sbi, FOREGROUND); ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; @@ -3073,7 +3088,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) goto out_unlock; fi->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: f2fs_unlock_op(sbi); @@ -3511,7 +3526,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } set_inode_flag(inode, FI_COMPRESS_RELEASED); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3710,7 +3725,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); } unlock_inode: @@ -3976,6 +3991,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) file_start_write(filp); inode_lock(inode); + f2fs_down_write(&F2FS_I(inode)->i_sem); if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { ret = -EBUSY; goto out; @@ -3995,6 +4011,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) f2fs_warn(sbi, "compression algorithm is successfully set, " "but current kernel doesn't support this algorithm."); out: + f2fs_up_write(&F2FS_I(inode)->i_sem); inode_unlock(inode); file_end_write(filp); @@ -4079,10 +4096,8 @@ static int f2fs_ioc_decompress_file(struct file *filp) last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); count = last_idx - page_idx; - while (count) { - int len = min(cluster_size, count); - - ret = redirty_blocks(inode, page_idx, len); + while (count && count >= cluster_size) { + ret = redirty_blocks(inode, page_idx, cluster_size); if (ret < 0) break; @@ -4092,8 +4107,14 @@ static int f2fs_ioc_decompress_file(struct file *filp) break; } - count -= len; - page_idx += len; + count -= cluster_size; + page_idx += cluster_size; + + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } } if (!ret) @@ -4153,10 +4174,8 @@ static int f2fs_ioc_compress_file(struct file *filp) last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); count = last_idx - page_idx; - while (count) { - int len = min(cluster_size, count); - - ret = redirty_blocks(inode, page_idx, len); + while (count && count >= cluster_size) { + ret = redirty_blocks(inode, page_idx, cluster_size); if (ret < 0) break; @@ -4166,8 +4185,14 @@ static int f2fs_ioc_compress_file(struct file *filp) break; } - count -= len; - page_idx += len; + count -= cluster_size; + page_idx += cluster_size; + + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } } if (!ret) @@ -4579,6 +4604,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_WRITE); if (error) return error; + f2fs_update_time(sbi, REQ_TIME); f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); return 0; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 01effd3fcb6c..f550cdeaa663 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -121,8 +121,8 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - if (!foreground) - stat_inc_bggc_count(sbi->stat_info); + stat_inc_gc_call_count(sbi, foreground ? + FOREGROUND : BACKGROUND); sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; @@ -1685,6 +1685,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, int seg_freed = 0, migrated = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; + unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE; int submitted = 0; if (__is_large_section(sbi)) @@ -1766,7 +1767,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, segno, gc_type, force_migrate); - stat_inc_seg_count(sbi, type, gc_type); + stat_inc_gc_seg_count(sbi, data_type, gc_type); sbi->gc_reclaimed_segs[sbi->gc_mode]++; migrated++; @@ -1783,12 +1784,12 @@ skip: } if (submitted) - f2fs_submit_merged_write(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA); + f2fs_submit_merged_write(sbi, data_type); blk_finish_plug(&plug); - stat_inc_call_count(sbi->stat_info); + if (migrated) + stat_inc_gc_sec_count(sbi, data_type, gc_type); return seg_freed; } @@ -1839,6 +1840,7 @@ gc_more: * secure free segments which doesn't need fggc any more. */ if (prefree_segments(sbi)) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1887,6 +1889,7 @@ retry: round++; if (skipped_round > MAX_SKIP_GC_COUNT && skipped_round * 2 >= round) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); goto stop; } @@ -1902,6 +1905,7 @@ retry: */ if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && prefree_segments(sbi)) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -2029,6 +2033,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, if (gc_only) goto out; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) goto out; @@ -2181,12 +2186,14 @@ out_drop_write: if (err) return err; - err = freeze_super(sbi->sb); + err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE); if (err) return err; if (f2fs_readonly(sbi->sb)) { - thaw_super(sbi->sb); + err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); + if (err) + return err; return -EROFS; } @@ -2221,6 +2228,7 @@ out_drop_write: clear_sbi_flag(sbi, SBI_IS_RESIZEFS); set_sbi_flag(sbi, SBI_IS_DIRTY); + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) { update_fs_metadata(sbi, secs); @@ -2240,6 +2248,6 @@ recover_out: out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); - thaw_super(sbi->sb); + thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); return err; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4638fee16a91..2fe25619ccb5 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -641,7 +641,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, } if (inode) { - f2fs_down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write_nested(&F2FS_I(inode)->i_sem, + SINGLE_DEPTH_NESTING); page = f2fs_init_inode_metadata(inode, dir, fname, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -698,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 09e986b050c6..cde243840abd 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -214,7 +214,7 @@ static bool sanity_check_compress_inode(struct inode *inode, f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, ri->i_compress_algorithm); - goto err; + return false; } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { @@ -222,14 +222,14 @@ static bool sanity_check_compress_inode(struct inode *inode, "%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), SECTOR_TO_BLOCK(inode->i_blocks)); - goto err; + return false; } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", __func__, inode->i_ino, ri->i_log_cluster_size); - goto err; + return false; } clevel = le16_to_cpu(ri->i_compress_flag) >> @@ -273,8 +273,6 @@ static bool sanity_check_compress_inode(struct inode *inode, err_level: f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", __func__, inode->i_ino, clevel); -err: - set_sbi_flag(sbi, SBI_NEED_FSCK); return false; } @@ -287,14 +285,12 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); if (!iblocks) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } if (ino_of_node(node_page) != nid_of_node(node_page)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, ino_of_node(node_page), nid_of_node(node_page)); @@ -303,7 +299,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { if (!f2fs_sb_has_extra_attr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", __func__, inode->i_ino); return false; @@ -311,7 +306,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || fi->i_extra_isize % sizeof(__le32)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", __func__, inode->i_ino, fi->i_extra_isize, F2FS_TOTAL_EXTRA_ATTR_SIZE); @@ -321,7 +315,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) f2fs_has_inline_xattr(inode) && (!fi->i_inline_xattr_size || fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", __func__, inode->i_ino, fi->i_inline_xattr_size, MAX_INLINE_XATTR_SIZE); @@ -335,7 +328,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } } else if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.", __func__, inode->i_ino); return false; @@ -343,31 +335,26 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) if (!f2fs_sb_has_extra_attr(sbi)) { if (f2fs_sb_has_project_quota(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); return false; } if (f2fs_sb_has_inode_chksum(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); return false; } if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); return false; } if (f2fs_sb_has_inode_crtime(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME); return false; } if (f2fs_sb_has_compression(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); return false; @@ -375,21 +362,18 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (f2fs_sanity_check_inline_data(inode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", __func__, inode->i_ino); return false; @@ -403,7 +387,7 @@ static void init_idisk_time(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[1] = inode_get_ctime(inode); fi->i_disk_time[2] = inode->i_mtime; } @@ -434,10 +418,10 @@ static int do_read_inode(struct inode *inode) inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); + inode_set_ctime(inode, le64_to_cpu(ri->i_ctime), + le32_to_cpu(ri->i_ctime_nsec)); inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); if (S_ISDIR(inode->i_mode)) @@ -475,6 +459,13 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -544,12 +535,6 @@ static int do_read_inode(struct inode *inode) f2fs_init_read_extent_tree(inode, node_page); f2fs_init_age_extent_tree(inode); - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); - return -EFSCORRUPTED; - } - if (!sanity_check_extent_cache(inode)) { f2fs_put_page(node_page, 1); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); @@ -714,10 +699,10 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); if (S_ISDIR(inode->i_mode)) ri->i_current_depth = diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bee0568888da..193b22a2d6bf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -243,7 +243,7 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; inode->i_generation = get_random_u32(); @@ -420,7 +420,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -1052,7 +1052,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_page = NULL; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -1086,7 +1086,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_i_pino_write(old_inode, new_dir->i_ino); f2fs_up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -1251,7 +1251,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_pino_write(old_inode, new_dir->i_ino); f2fs_up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = current_time(old_dir); + inode_set_ctime_current(old_dir); if (old_nlink) { f2fs_down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -1270,7 +1270,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_pino_write(new_inode, old_dir->i_ino); f2fs_up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = current_time(new_dir); + inode_set_ctime_current(new_dir); if (new_nlink) { f2fs_down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4e7d4ceeb084..7be60df277a5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -321,10 +321,10 @@ static int recover_inode(struct inode *inode, struct page *page) f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode_set_ctime(inode, le64_to_cpu(raw->i_ctime), + le32_to_cpu(raw->i_ctime_nsec)); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); F2FS_I(inode)->i_advise = raw->i_advise; @@ -924,6 +924,7 @@ skip: struct cp_control cpc = { .reason = CP_RECOVERY, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0457d620011f..d05b41608fc0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -205,6 +205,8 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } + /* avoid stale dirty inode during eviction */ + sync_inode_metadata(inode, 0); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -433,6 +435,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .err_gc_skipped = false, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } } @@ -510,8 +513,8 @@ do_sync: mutex_unlock(&sbi->flush_lock); } + stat_inc_cp_call_count(sbi, BACKGROUND); f2fs_sync_fs(sbi->sb, 1); - stat_inc_bg_cp_count(sbi->stat_info); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, @@ -1258,8 +1261,16 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { - __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); - return 0; + int devi = f2fs_bdev_index(sbi, bdev); + + if (devi < 0) + return -EINVAL; + + if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { + __submit_zone_reset_cmd(sbi, dc, flag, + wait_list, issued); + return 0; + } } #endif @@ -1785,15 +1796,24 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) dc = __lookup_discard_cmd(sbi, blkaddr); #ifdef CONFIG_BLK_DEV_ZONED if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { - /* force submit zone reset */ - if (dc->state == D_PREP) - __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, - &dcc->wait_list, NULL); - dc->ref++; - mutex_unlock(&dcc->cmd_lock); - /* wait zone reset */ - __wait_one_discard_bio(sbi, dc); - return; + int devi = f2fs_bdev_index(sbi, dc->bdev); + + if (devi < 0) { + mutex_unlock(&dcc->cmd_lock); + return; + } + + if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { + /* force submit zone reset */ + if (dc->state == D_PREP) + __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, + &dcc->wait_list, NULL); + dc->ref++; + mutex_unlock(&dcc->cmd_lock); + /* wait zone reset */ + __wait_one_discard_bio(sbi, dc); + return; + } } #endif if (dc) { @@ -2193,7 +2213,7 @@ find_next: len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || - !force || len < cpc->trim_minlen) + (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, @@ -3228,6 +3248,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) goto out; f2fs_down_write(&sbi->gc_lock); + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); if (err) @@ -4846,17 +4867,17 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, { unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; block_t zone_block, wp_block, last_valid_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int i, s, b, ret; struct seg_entry *se; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = fdev->start_blk + (zone->wp >> sbi->log_sectors_per_block); + wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - zone_block = fdev->start_blk + (zone->start >> - sbi->log_sectors_per_block); + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); @@ -4906,7 +4927,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, "pointer. Reset the write pointer: wp[0x%x,0x%x]", wp_segno, wp_blkoff); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, - zone->len >> sbi->log_sectors_per_block); + zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); @@ -4927,12 +4948,19 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), wp_segno, wp_blkoff); - ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, - zone->len - (zone->wp - zone->start), - GFP_NOFS, 0); - if (ret) - f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", - fdev->path, ret); + ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, + zone->start, zone->len, GFP_NOFS); + if (ret == -EOPNOTSUPP) { + ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, + zone->len - (zone->wp - zone->start), + GFP_NOFS, 0); + if (ret) + f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", + fdev->path, ret); + } else if (ret) { + f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)", + fdev->path, ret); + } return ret; } @@ -4967,6 +4995,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; @@ -4978,8 +5007,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; /* report zone for the sector the curseg points to */ - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << - sbi->log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -4991,10 +5020,10 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = zbd->start_blk + (zone.wp >> sbi->log_sectors_per_block); + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - wp_sector_off = zone.wp & GENMASK(sbi->log_sectors_per_block - 1, 0); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && wp_sector_off == 0) @@ -5021,8 +5050,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (!zbd) return 0; - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << - sbi->log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -5040,7 +5069,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, - zone.len >> sbi->log_sectors_per_block); + zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ca31163da00a..a8c8232852bb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -591,7 +591,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) unsigned int level; if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL; + F2FS_OPTION(sbi).compress_level = 0; return 0; } @@ -862,11 +862,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "adaptive")) { - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature"); - kfree(name); - return -EINVAL; - } F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; @@ -1331,6 +1326,11 @@ default_check: F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; } + + if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { + f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); + return -EINVAL; + } #else f2fs_err(sbi, "Zoned block device support is not enabled"); return -EINVAL; @@ -1561,7 +1561,8 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) int i; for (i = 0; i < sbi->s_ndevs; i++) { - blkdev_put(FDEV(i).bdev, sbi->sb->s_type); + if (i > 0) + blkdev_put(FDEV(i).bdev, sbi->sb); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -1600,6 +1601,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } @@ -1609,6 +1611,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } @@ -1705,8 +1708,10 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) + if (sync) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_issue_checkpoint(sbi); + } return err; } @@ -2205,6 +2210,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; @@ -2230,6 +2236,7 @@ skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) goto out_unlock; @@ -2703,7 +2710,7 @@ retry: if (len == towrite) return err; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; } @@ -4190,16 +4197,12 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) sbi->aligned_blksize = true; for (i = 0; i < max_devices; i++) { - - if (i > 0 && !RDEV(i).path[0]) + if (i == 0) + FDEV(0).bdev = sbi->sb->s_bdev; + else if (!RDEV(i).path[0]) break; - if (max_devices == 1) { - /* Single zoned block device mount */ - FDEV(0).bdev = - blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode, - sbi->sb->s_type, NULL); - } else { + if (max_devices > 1) { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); FDEV(i).total_segments = @@ -4215,10 +4218,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + mode, sbi->sb, NULL); } - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode, - sbi->sb->s_type, - NULL); } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -4871,6 +4873,7 @@ static void kill_f2fs_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 48b7e0073884..417fae96890f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -356,6 +356,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!strcmp(a->attr.name, "revoked_atomic_block")) return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block); +#ifdef CONFIG_F2FS_STAT_FS + if (!strcmp(a->attr.name, "cp_foreground_calls")) + return sysfs_emit(buf, "%d\n", + atomic_read(&sbi->cp_call_count[TOTAL_CALL]) - + atomic_read(&sbi->cp_call_count[BACKGROUND])); + if (!strcmp(a->attr.name, "cp_background_calls")) + return sysfs_emit(buf, "%d\n", + atomic_read(&sbi->cp_call_count[BACKGROUND])); +#endif + ui = (unsigned int *)(ptr + a->offset); return sysfs_emit(buf, "%u\n", *ui); @@ -972,10 +982,10 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS -STAT_INFO_RO_ATTR(cp_foreground_calls, cp_count); -STAT_INFO_RO_ATTR(cp_background_calls, bg_cp_count); -STAT_INFO_RO_ATTR(gc_foreground_calls, call_count); -STAT_INFO_RO_ATTR(gc_background_calls, bg_gc); +STAT_INFO_RO_ATTR(cp_foreground_calls, cp_call_count[FOREGROUND]); +STAT_INFO_RO_ATTR(cp_background_calls, cp_call_count[BACKGROUND]); +STAT_INFO_RO_ATTR(gc_foreground_calls, gc_call_count[FOREGROUND]); +STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]); #endif /* FAULT_INFO ATTR */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 476b186b90a6..a657284faee3 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -757,17 +757,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode, true); - if (!error && S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); same: if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } + inode_set_ctime_current(inode); + f2fs_mark_inode_dirty_sync(inode, true); exit: kfree(base_addr); return error; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index afe83b4e7172..25fae1c83725 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config FAT_FS tristate + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e3b690b48e3e..66cf4778cf3b 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -460,8 +460,7 @@ extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); -extern int fat_update_time(struct inode *inode, struct timespec64 *now, - int flags); +extern int fat_update_time(struct inode *inode, int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); diff --git a/fs/fat/file.c b/fs/fat/file.c index 456477946dd9..e887e9ab7472 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -401,7 +401,7 @@ int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d99b8549ec8f..cdd39b6020f3 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -562,7 +562,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); - inode->i_ctime = inode->i_mtime; + inode_set_ctime_to_ts(inode, inode->i_mtime); if (sbi->options.isvfat) { fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, @@ -1407,8 +1407,7 @@ static int fat_read_root(struct inode *inode) MSDOS_I(inode)->mmu_private = inode->i_size; fat_save_attrs(inode, ATTR_DIR); - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; - inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0); set_nlink(inode, fat_subdirs(inode)+2); return 0; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 7e5d6ae305f2..f2304a1054aa 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -332,13 +332,14 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) * but ctime updates are ignored. */ if (flags & S_MTIME) - inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); + inode->i_mtime = inode_set_ctime_to_ts(inode, + fat_truncate_mtime(sbi, now)); return 0; } EXPORT_SYMBOL_GPL(fat_truncate_time); -int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) +int fat_update_time(struct inode *inode, int flags) { int dirty_flags = 0; @@ -346,16 +347,13 @@ int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) return 0; if (flags & (S_ATIME | S_CTIME | S_MTIME)) { - fat_truncate_time(inode, now, flags); + fat_truncate_time(inode, NULL, flags); if (inode->i_sb->s_flags & SB_LAZYTIME) dirty_flags |= I_DIRTY_TIME; else dirty_flags |= I_DIRTY_SYNC; } - if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) - dirty_flags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, dirty_flags); return 0; } diff --git a/fs/fcntl.c b/fs/fcntl.c index b622be119706..e871009f6c88 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -34,7 +34,7 @@ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) -static int setfl(int fd, struct file * filp, unsigned long arg) +static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; @@ -112,11 +112,11 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; - int who = arg, ret = 0; + int ret = 0; type = PIDTYPE_TGID; if (who < 0) { @@ -317,28 +317,29 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_DUPFD: - err = f_dupfd(arg, filp, 0); + err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: - err = f_dupfd(arg, filp, O_CLOEXEC); + err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; - set_close_on_exec(fd, arg & FD_CLOEXEC); + set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: - err = setfl(fd, filp, arg); + err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -375,7 +376,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -391,28 +392,28 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_SETSIG: /* arg == 0 restores default behaviour. */ - if (!valid_signal(arg)) { + if (!valid_signal(argi)) { break; } err = 0; - filp->f_owner.signum = arg; + filp->f_owner.signum = argi; break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: - err = fcntl_setlease(fd, filp, arg); + err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: - err = fcntl_dirnotify(fd, filp, arg); + err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: - err = pipe_fcntl(filp, cmd, arg); + err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: - err = memfd_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/fs/file.c b/fs/file.c index 3fd003a8604f..3e4a4dfa38fc 100644 --- a/fs/file.c +++ b/fs/file.c @@ -668,7 +668,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table - * @cur_fds: files struct + * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * @@ -693,29 +693,30 @@ static inline void __range_cloexec(struct files_struct *cur_fds, spin_unlock(&cur_fds->file_lock); } -static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, +static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { + struct file *file; unsigned n; - rcu_read_lock(); - n = last_fd(files_fdtable(cur_fds)); - rcu_read_unlock(); + spin_lock(&files->file_lock); + n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); - while (fd <= max_fd) { - struct file *file; - - spin_lock(&cur_fds->file_lock); - file = pick_file(cur_fds, fd++); - spin_unlock(&cur_fds->file_lock); - + for (; fd <= max_fd; fd++) { + file = pick_file(files, fd); if (file) { - /* found a valid file to close */ - filp_close(file, cur_fds); + spin_unlock(&files->file_lock); + filp_close(file, files); cond_resched(); + spin_lock(&files->file_lock); + } else if (need_resched()) { + spin_unlock(&files->file_lock); + cond_resched(); + spin_lock(&files->file_lock); } } + spin_unlock(&files->file_lock); } /** @@ -723,6 +724,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. diff --git a/fs/file_table.c b/fs/file_table.c index fc7d677ff5ad..ee21b3da9d08 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -461,11 +461,8 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { - struct task_struct *task = current; - BUG_ON(!(task->flags & PF_KTHREAD)); + if (atomic_long_dec_and_test(&file->f_count)) __fput(file); - } } EXPORT_SYMBOL(fput); diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 0e2fc08f7de4..912107ebea6f 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -2,6 +2,7 @@ config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK + select BUFFER_HEAD help FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. VERITAS VxFS(TM) is the standard file system diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ceb6a12649ba..ac5d43b164b5 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -110,10 +110,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi, inode->i_size = vip->vii_size; inode->i_atime.tv_sec = vip->vii_atime; - inode->i_ctime.tv_sec = vip->vii_ctime; + inode_set_ctime(inode, vip->vii_ctime, 0); inode->i_mtime.tv_sec = vip->vii_mtime; inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_blocks = vip->vii_blocks; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aca4b4811394..c1af01b2c42d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1535,10 +1535,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, if (wbc->pages_skipped) { /* - * writeback is not making progress due to locked - * buffers. Skip this inode for now. + * Writeback is not making progress due to locked buffers. + * Skip this inode for now. Although having skipped pages + * is odd for clean inodes, it can happen for some + * filesystems so handle that gracefully. */ - redirty_tail_locked(inode, wb); + if (inode->i_state & I_DIRTY_ALL) + redirty_tail_locked(inode, wb); + else + inode_cgwb_move_to_attached(inode, wb); return; } @@ -1953,9 +1958,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!trylock_super(sb)) { + if (!super_trylock_shared(sb)) { /* - * trylock_super() may fail consistently due to + * super_trylock_shared() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ diff --git a/fs/fs_context.c b/fs/fs_context.c index 851214d1d013..a0ad7a0c4680 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. + * @fc: Filesystem context. + * @key: Parameter name. + * @value: Default value. + * @v_size: Maximum number of bytes in the value. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) @@ -189,7 +193,7 @@ EXPORT_SYMBOL(vfs_parse_fs_string); /** * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data - * @ctx: The superblock configuration to fill in. + * @fc: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be @@ -315,10 +319,31 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, } EXPORT_SYMBOL(fs_context_for_reconfigure); +/** + * fs_context_for_submount: allocate a new fs_context for a submount + * @type: file_system_type of the new context + * @reference: reference dentry from which to copy relevant info + * + * Allocate a new fs_context suitable for a submount. This also ensures that + * the fc->security object is inherited from @reference (if needed). + */ struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { - return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + struct fs_context *fc; + int ret; + + fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + if (IS_ERR(fc)) + return fc; + + ret = security_fs_context_submount(fc, reference->d_sb); + if (ret) { + put_fs_context(fc); + return ERR_PTR(ret); + } + + return fc; } EXPORT_SYMBOL(fs_context_for_submount); @@ -333,7 +358,7 @@ void fc_drop_locked(struct fs_context *fc) static void legacy_fs_context_free(struct fs_context *fc); /** - * vfs_dup_fc_config: Duplicate a filesystem context. + * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) @@ -379,7 +404,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context - * @fc: The filesystem context to log to. + * @log: The filesystem context to log to, or NULL to use printk. + * @prefix: A string to prefix the output with, or NULL. + * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) @@ -692,6 +719,7 @@ void vfs_clean_context(struct fs_context *fc) security_free_mnt_opts(&fc->security); kfree(fc->source); fc->source = NULL; + fc->exclusive = false; fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; fc->phase = FS_CONTEXT_AWAITING_RECONF; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 04b3f5b9c629..64c2d0814ed6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -62,7 +62,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) int count = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -79,7 +79,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) spin_unlock(&fs->lock); } task_unlock(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); while (count--) path_put(old_root); diff --git a/fs/fsopen.c b/fs/fsopen.c index fc9d2d9fd234..ce03f6521c88 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -209,6 +209,72 @@ err: return ret; } +static int vfs_cmd_create(struct fs_context *fc, bool exclusive) +{ + struct super_block *sb; + int ret; + + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + + if (!mount_capable(fc)) + return -EPERM; + + /* require the new mount api */ + if (exclusive && fc->ops == &legacy_fs_context_ops) + return -EOPNOTSUPP; + + fc->phase = FS_CONTEXT_CREATING; + fc->exclusive = exclusive; + + ret = vfs_get_tree(fc); + if (ret) { + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + /* vfs_get_tree() callchains will have grabbed @s_umount */ + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; +} + +static int vfs_cmd_reconfigure(struct fs_context *fc) +{ + struct super_block *sb; + int ret; + + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + fc->phase = FS_CONTEXT_RECONFIGURING; + + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + fc->phase = FS_CONTEXT_FAILED; + return -EPERM; + } + + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) { + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + vfs_clean_context(fc); + return 0; +} + /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. @@ -216,7 +282,6 @@ err: static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, struct fs_parameter *param) { - struct super_block *sb; int ret; ret = finish_clean_context(fc); @@ -224,39 +289,11 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: - if (fc->phase != FS_CONTEXT_CREATE_PARAMS) - return -EBUSY; - if (!mount_capable(fc)) - return -EPERM; - fc->phase = FS_CONTEXT_CREATING; - ret = vfs_get_tree(fc); - if (ret) - break; - sb = fc->root->d_sb; - ret = security_sb_kern_mount(sb); - if (unlikely(ret)) { - fc_drop_locked(fc); - break; - } - up_write(&sb->s_umount); - fc->phase = FS_CONTEXT_AWAITING_MOUNT; - return 0; + return vfs_cmd_create(fc, false); + case FSCONFIG_CMD_CREATE_EXCL: + return vfs_cmd_create(fc, true); case FSCONFIG_CMD_RECONFIGURE: - if (fc->phase != FS_CONTEXT_RECONF_PARAMS) - return -EBUSY; - fc->phase = FS_CONTEXT_RECONFIGURING; - sb = fc->root->d_sb; - if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - ret = -EPERM; - break; - } - down_write(&sb->s_umount); - ret = reconfigure_super(fc); - up_write(&sb->s_umount); - if (ret) - break; - vfs_clean_context(fc); - return 0; + return vfs_cmd_reconfigure(fc); default: if (fc->phase != FS_CONTEXT_CREATE_PARAMS && fc->phase != FS_CONTEXT_RECONF_PARAMS) @@ -264,8 +301,6 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return vfs_parse_fs_param(fc, param); } - fc->phase = FS_CONTEXT_FAILED; - return ret; } /** @@ -353,6 +388,7 @@ SYSCALL_DEFINE5(fsconfig, return -EINVAL; break; case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_CREATE_EXCL: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) return -EINVAL; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 247ef4f76761..ab62e4624256 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8e74f278a3f6..23904a6a9a96 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping, return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); } -static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, bool write) +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, + bool write) { vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -809,7 +809,7 @@ retry: * to populate page cache or access memory we are trying to free. */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; @@ -818,7 +818,7 @@ retry: } if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) @@ -829,24 +829,22 @@ retry: static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, - vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE); } -static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { - return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static const struct vm_operations_struct fuse_dax_vm_ops = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f67bef9d83c4..d707e6987da9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -92,7 +92,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(u64 sec, u32 nsec) +u64 fuse_time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { struct timespec64 ts = { @@ -112,17 +112,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, - time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); -} - -static u64 attr_timeout(struct fuse_attr_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); -} - -u64 entry_attr_timeout(struct fuse_entry_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); + fuse_time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) @@ -265,8 +255,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto invalid; forget_all_cached_acls(inode); - fuse_change_attributes(inode, &outarg.attr, - entry_attr_timeout(&outarg), + fuse_change_attributes(inode, &outarg.attr, NULL, + ATTR_TIMEOUT(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { @@ -360,10 +350,14 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } +static bool fuse_valid_size(u64 size) +{ + return size <= LLONG_MAX; +} + bool fuse_invalid_attr(struct fuse_attr *attr) { - return !fuse_valid_type(attr->mode) || - attr->size > LLONG_MAX; + return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size); } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, @@ -399,7 +393,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, - &outarg->attr, entry_attr_timeout(outarg), + &outarg->attr, ATTR_TIMEOUT(outarg), attr_version); err = -ENOMEM; if (!*inode) { @@ -686,7 +680,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, entry_attr_timeout(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -755,7 +749,8 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (err == -ENOSYS) { fc->no_create = 1; goto mknod; - } + } else if (err == -EEXIST) + fuse_invalidate_entry(entry); out_dput: dput(res); return err; @@ -813,7 +808,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, entry_attr_timeout(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -835,6 +830,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return 0; out_put_forget_req: + if (err == -EEXIST) + fuse_invalidate_entry(entry); kfree(forget); return err; } @@ -933,7 +930,7 @@ void fuse_flush_time_update(struct inode *inode) static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty_sync(inode); fuse_flush_time_update(inode); } @@ -986,7 +983,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); - } else if (err == -EINTR) + } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } @@ -1009,7 +1006,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); - } else if (err == -EINTR) + } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } @@ -1050,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) fuse_entry_unlinked(newent); - } else if (err == -EINTR) { + } else if (err == -EINTR || err == -ENOENT) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation fails (e.g. some process has CWD under the renamed @@ -1153,6 +1150,87 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->blksize = 1 << blkbits; } +static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->ino = sx->ino; + attr->size = sx->size; + attr->blocks = sx->blocks; + attr->atime = sx->atime.tv_sec; + attr->mtime = sx->mtime.tv_sec; + attr->ctime = sx->ctime.tv_sec; + attr->atimensec = sx->atime.tv_nsec; + attr->mtimensec = sx->mtime.tv_nsec; + attr->ctimensec = sx->ctime.tv_nsec; + attr->mode = sx->mode; + attr->nlink = sx->nlink; + attr->uid = sx->uid; + attr->gid = sx->gid; + attr->rdev = new_encode_dev(MKDEV(sx->rdev_major, sx->rdev_minor)); + attr->blksize = sx->blksize; +} + +static int fuse_do_statx(struct inode *inode, struct file *file, + struct kstat *stat) +{ + int err; + struct fuse_attr attr; + struct fuse_statx *sx; + struct fuse_statx_in inarg; + struct fuse_statx_out outarg; + struct fuse_mount *fm = get_fuse_mount(inode); + u64 attr_version = fuse_get_attr_version(fm->fc); + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + /* For now leave sync hints as the default, request all stats. */ + inarg.sx_flags = 0; + inarg.sx_mask = STATX_BASIC_STATS | STATX_BTIME; + args.opcode = FUSE_STATX; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err) + return err; + + sx = &outarg.stat; + if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || + ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || + inode_wrong_type(inode, sx->mode)))) { + make_bad_inode(inode); + return -EIO; + } + + fuse_statx_to_attr(&outarg.stat, &attr); + if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) { + fuse_change_attributes(inode, &attr, &outarg.stat, + ATTR_TIMEOUT(&outarg), attr_version); + } + + if (stat) { + stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); + stat->btime.tv_sec = sx->btime.tv_sec; + stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + fuse_fillattr(inode, &attr, stat); + stat->result_mask |= STATX_TYPE; + } + + return 0; +} + static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct file *file) { @@ -1189,8 +1267,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, fuse_make_bad(inode); err = -EIO; } else { - fuse_change_attributes(inode, &outarg.attr, - attr_timeout(&outarg), + fuse_change_attributes(inode, &outarg.attr, NULL, + ATTR_TIMEOUT(&outarg), attr_version); if (stat) fuse_fillattr(inode, &outarg.attr, stat); @@ -1204,12 +1282,22 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); int err = 0; bool sync; u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); - if (flags & AT_STATX_FORCE_SYNC) + + /* FUSE only supports basic stats and possibly btime */ + request_mask &= STATX_BASIC_STATS | STATX_BTIME; +retry: + if (fc->no_statx) + request_mask &= STATX_BASIC_STATS; + + if (!request_mask) + sync = false; + else if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; @@ -1220,11 +1308,24 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, if (sync) { forget_all_cached_acls(inode); - err = fuse_do_getattr(inode, stat, file); + /* Try statx if BTIME is requested */ + if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { + err = fuse_do_statx(inode, file, stat); + if (err == -ENOSYS) { + fc->no_statx = 1; + goto retry; + } + } else { + err = fuse_do_getattr(inode, stat, file); + } } else if (stat) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + if (test_bit(FUSE_I_BTIME, &fi->state)) { + stat->btime = fi->i_btime; + stat->result_mask |= STATX_BTIME; + } } return err; @@ -1715,8 +1816,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.mtimensec = inode->i_mtime.tv_nsec; if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; - inarg.ctime = inode->i_ctime.tv_sec; - inarg.ctimensec = inode->i_ctime.tv_nsec; + inarg.ctime = inode_get_ctime(inode).tv_sec; + inarg.ctimensec = inode_get_ctime(inode).tv_nsec; } if (ff) { inarg.valid |= FATTR_FH; @@ -1857,12 +1958,12 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (attr->ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ } - fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg), + fuse_change_attributes_common(inode, &outarg.attr, NULL, + ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bc4115288eec..1cdb6327511e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,7 +19,6 @@ #include <linux/uio.h> #include <linux/fs.h> #include <linux/filelock.h> -#include <linux/file.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -479,36 +478,48 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -struct fuse_flush_args { - struct fuse_args args; - struct fuse_flush_in inarg; - struct work_struct work; - struct file *file; -}; - -static int fuse_do_flush(struct fuse_flush_args *fa) +static int fuse_flush(struct file *file, fl_owner_t id) { - int err; - struct inode *inode = file_inode(fa->file); + struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + struct fuse_flush_in inarg; + FUSE_ARGS(args); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; err = write_inode_now(inode, 1); if (err) - goto out; + return err; inode_lock(inode); fuse_sync_writes(inode); inode_unlock(inode); - err = filemap_check_errors(fa->file->f_mapping); + err = filemap_check_errors(file->f_mapping); if (err) - goto out; + return err; err = 0; if (fm->fc->no_flush) goto inval_attr_out; - err = fuse_simple_request(fm, &fa->args); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -521,57 +532,9 @@ inval_attr_out: */ if (!err && fm->fc->writeback_cache) fuse_invalidate_attr_mask(inode, STATX_BLOCKS); - -out: - fput(fa->file); - kfree(fa); return err; } -static void fuse_flush_async(struct work_struct *work) -{ - struct fuse_flush_args *fa = container_of(work, typeof(*fa), work); - - fuse_do_flush(fa); -} - -static int fuse_flush(struct file *file, fl_owner_t id) -{ - struct fuse_flush_args *fa; - struct inode *inode = file_inode(file); - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_file *ff = file->private_data; - - if (fuse_is_bad(inode)) - return -EIO; - - if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) - return 0; - - fa = kzalloc(sizeof(*fa), GFP_KERNEL); - if (!fa) - return -ENOMEM; - - fa->inarg.fh = ff->fh; - fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); - fa->args.opcode = FUSE_FLUSH; - fa->args.nodeid = get_node_id(inode); - fa->args.in_numargs = 1; - fa->args.in_args[0].size = sizeof(fa->inarg); - fa->args.in_args[0].value = &fa->inarg; - fa->args.force = true; - fa->file = get_file(file); - - /* Don't wait if the task is exiting */ - if (current->flags & PF_EXITING) { - INIT_WORK(&fa->work, fuse_flush_async); - schedule_work(&fa->work); - return 0; - } - - return fuse_do_flush(fa); -} - int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { @@ -1465,7 +1428,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? fc->max_write : fc->max_read; @@ -1477,12 +1441,20 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; struct fuse_io_args *ia; unsigned int max_pages; + bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; max_pages = iov_iter_npages(iter, fc->max_pages); ia = fuse_io_alloc(io, max_pages); if (!ia) return -ENOMEM; + if (fopen_direct_io && fc->direct_io_relax) { + res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); + if (res) { + fuse_io_free(ia); + return res; + } + } if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1491,6 +1463,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } + if (fopen_direct_io && write) { + res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); + if (res) { + fuse_io_free(ia); + return res; + } + } + io->should_dirty = !write && user_backed_iter(iter); while (count) { ssize_t nres; @@ -2478,14 +2458,17 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) + /* Can't provide the coherency needed for MAP_SHARED + * if FUSE_DIRECT_IO_RELAX isn't set. + */ + if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax) return -ENODEV; invalidate_inode_pages2(file->f_mapping); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 9b7fc7d3c7f1..bf0b85d0b95c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -88,6 +88,9 @@ struct fuse_inode { preserve the original mode */ umode_t orig_i_mode; + /* Cache birthtime */ + struct timespec64 i_btime; + /** 64 bit inode number */ u64 orig_ino; @@ -167,6 +170,8 @@ enum { FUSE_I_SIZE_UNSTABLE, /* Bad inode */ FUSE_I_BAD, + /* Has btime */ + FUSE_I_BTIME, }; struct fuse_conn; @@ -792,6 +797,12 @@ struct fuse_conn { /* Is tmpfile not implemented by fs? */ unsigned int no_tmpfile:1; + /* relax restrictions in FOPEN_DIRECT_IO mode */ + unsigned int direct_io_relax:1; + + /* Is statx not implemented by fs? */ + unsigned int no_statx:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -1058,9 +1069,11 @@ void fuse_init_symlink(struct inode *inode); * Change attributes of an inode */ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u32 cache_mask); u32 fuse_get_cache_mask(struct inode *inode); @@ -1111,7 +1124,10 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); -u64 entry_attr_timeout(struct fuse_entry_out *o); +u64 fuse_time_to_jiffies(u64 sec, u32 nsec); +#define ATTR_TIMEOUT(o) \ + fuse_time_to_jiffies((o)->attr_valid, (o)->attr_valid_nsec) + void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f19d748890f0..2e4eb7cf26fb 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -77,7 +77,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return NULL; fi->i_time = 0; - fi->inval_mask = 0; + fi->inval_mask = ~0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; @@ -163,6 +163,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u32 cache_mask) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -172,7 +173,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - WRITE_ONCE(fi->inval_mask, 0); + /* Clear basic stats from invalid mask */ + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -194,8 +196,26 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_mtime.tv_nsec = attr->mtimensec; } if (!(cache_mask & STATX_CTIME)) { - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_ctime(inode, attr->ctime, attr->ctimensec); + } + if (sx) { + /* Sanitize nsecs */ + sx->btime.tv_nsec = + min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + + /* + * Btime has been queried, cache is valid (whether or not btime + * is available or not) so clear STATX_BTIME from inval_mask. + * + * Availability of the btime attribute is indicated in + * FUSE_I_BTIME + */ + set_mask_bits(&fi->inval_mask, STATX_BTIME, 0); + if (sx->mask & STATX_BTIME) { + set_bit(FUSE_I_BTIME, &fi->state); + fi->i_btime.tv_sec = sx->btime.tv_sec; + fi->i_btime.tv_nsec = sx->btime.tv_nsec; + } } if (attr->blksize != 0) @@ -236,6 +256,7 @@ u32 fuse_get_cache_mask(struct inode *inode) } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u64 attr_version) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -259,8 +280,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, attr->mtimensec = inode->i_mtime.tv_nsec; } if (cache_mask & STATX_CTIME) { - attr->ctime = inode->i_ctime.tv_sec; - attr->ctimensec = inode->i_ctime.tv_nsec; + attr->ctime = inode_get_ctime(inode).tv_sec; + attr->ctimensec = inode_get_ctime(inode).tv_nsec; } if ((attr_version != 0 && fi->attr_version > attr_version) || @@ -270,7 +291,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); oldsize = inode->i_size; /* @@ -318,8 +339,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode, attr->flags); @@ -408,7 +428,7 @@ done: spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); - fuse_change_attributes(inode, attr, attr_valid, attr_version); + fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); return inode; } @@ -1212,6 +1232,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->init_security = 1; if (flags & FUSE_CREATE_SUPP_GROUP) fc->create_supp_group = 1; + if (flags & FUSE_DIRECT_IO_RELAX) + fc->direct_io_relax = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1258,7 +1280,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1401,16 +1423,18 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); static void fuse_fill_attr_from_inode(struct fuse_attr *attr, const struct fuse_inode *fi) { + struct timespec64 ctime = inode_get_ctime(&fi->inode); + *attr = (struct fuse_attr){ .ino = fi->inode.i_ino, .size = fi->inode.i_size, .blocks = fi->inode.i_blocks, .atime = fi->inode.i_atime.tv_sec, .mtime = fi->inode.i_mtime.tv_sec, - .ctime = fi->inode.i_ctime.tv_sec, + .ctime = ctime.tv_sec, .atimensec = fi->inode.i_atime.tv_nsec, .mtimensec = fi->inode.i_mtime.tv_nsec, - .ctimensec = fi->inode.i_ctime.tv_nsec, + .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, .uid = fi->inode.i_uid.val, diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index dc603479b30e..9e6d587b3e67 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -223,8 +223,8 @@ retry: spin_unlock(&fi->lock); forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), + fuse_change_attributes(inode, &o->attr, NULL, + ATTR_TIMEOUT(o), attr_version); /* * The other branch comes via fuse_iget() @@ -232,7 +232,7 @@ retry: */ } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), + &o->attr, ATTR_TIMEOUT(o), attr_version); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -243,8 +243,16 @@ retry: dput(dentry); dentry = alias; } - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (!IS_ERR(inode)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->nlookup--; + spin_unlock(&fi->lock); + } return PTR_ERR(dentry); + } } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 03c966840422..be7f87a8e11a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config GFS2_FS tristate "GFS2 file system support" + select BUFFER_HEAD select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index a392aa0f041d..443640e6fb9c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -142,7 +142,7 @@ int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, ret = __gfs2_set_acl(inode, acl, type); if (!ret && mode != inode->i_mode) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode->i_mode = mode; mark_inode_dirty(inode); } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ae49256b7c8c..c26d48355cc2 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -183,13 +183,13 @@ static int gfs2_writepages(struct address_space *mapping, int ret; /* - * Even if we didn't write any pages here, we might still be holding + * Even if we didn't write enough pages here, we might still be holding * dirty pages in the ail. We forcibly flush the ail because we don't * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. */ ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); - if (ret == 0) + if (ret == 0 && wbc->nr_to_write > 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); return ret; } @@ -272,8 +272,7 @@ continue_unlock: * not be suitable for data integrity * writeout). */ - *done_index = folio->index + - folio_nr_pages(folio); + *done_index = folio_next_index(folio); ret = 1; break; } @@ -747,7 +746,7 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .read_folio = gfs2_read_folio, .readahead = gfs2_readahead, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8d611fbcf0bd..ef7017fb6951 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -161,7 +161,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip) int error; down_write(&ip->i_rw_mutex); - page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + page = grab_cache_page(inode->i_mapping, 0); error = -ENOMEM; if (!page) goto out; @@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) if (status) return ERR_PTR(status); - folio = iomap_get_folio(iter, pos); + folio = iomap_get_folio(iter, pos, len); if (IS_ERR(folio)) gfs2_trans_end(sdp); return folio; @@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize) ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; i_size_write(inode, newsize); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); if (journaled) @@ -1583,8 +1583,7 @@ out_unlock: /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ - ip->i_inode.i_mtime = ip->i_inode.i_ctime = - current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1950,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); @@ -1993,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); gfs2_ordered_del_inode(ip); } - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); @@ -2094,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_end_trans; truncate_setsize(inode, size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 54a6d17b8c25..1a2afa88f8be 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -227,7 +227,7 @@ out: if (ip->i_inode.i_size < offset + copied) i_size_write(&ip->i_inode, offset + copied); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); @@ -1814,7 +1814,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); - tv = current_time(&ip->i_inode); + tv = inode_set_ctime_current(&ip->i_inode); if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); @@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; + ip->i_inode.i_mtime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1876,7 +1876,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - struct timespec64 tv = current_time(&dip->i_inode); + struct timespec64 tv; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1896,6 +1896,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } dirent_del(dip, bh, prev, dent); + tv = inode_set_ctime_current(&dip->i_inode); if (dip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; u16 entries = be16_to_cpu(leaf->lf_entries); @@ -1910,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; + dip->i_inode.i_mtime = tv; if (d_is_dir(dentry)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -1951,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, dent->de_type = cpu_to_be16(new_type); brelse(bh); - dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode); + dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode); mark_inode_dirty_sync(&dip->i_inode); return 0; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b43fa8b8fc05..f2700477a300 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -260,7 +260,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); @@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } @@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_rindex_update(sdp); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } @@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) ap.target = data_blocks + ind_blocks; err = gfs2_quota_lock_check(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } err = gfs2_inplace_reserve(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_quota_unlock; } @@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) } err = gfs2_trans_begin(sdp, rblocks, 0); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_fail; } @@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_end; } } @@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_allocate_page_backing(page, length); if (err) - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out_page_locked: if (ret != VM_FAULT_LOCKED) @@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } ret = filemap_fault(vmf); @@ -1436,17 +1436,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (cmd == F_CANCELLK) { - /* Hack: */ - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } if (unlikely(gfs2_withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; } - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1438e7465e30..4a280be229a6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -176,7 +176,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + wake_up(&sdp->sd_kill_wait); } /** @@ -468,10 +468,10 @@ done: * do_promote - promote as many requests as possible on the current queue * @gl: The glock * - * Returns: 1 if there is a blocked holder at the head of the list + * Returns true on success (i.e., progress was made or there are no waiters). */ -static int do_promote(struct gfs2_glock *gl) +static bool do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; @@ -484,10 +484,10 @@ static int do_promote(struct gfs2_glock *gl) * If we get here, it means we may not grant this * holder for some reason. If this holder is at the * head of the list, it means we have a blocked holder - * at the head, so return 1. + * at the head, so return false. */ if (list_is_first(&gh->gh_list, &gl->gl_holders)) - return 1; + return false; do_error(gl, 0); break; } @@ -497,7 +497,7 @@ static int do_promote(struct gfs2_glock *gl) if (!current_gh) current_gh = gh; } - return 0; + return true; } /** @@ -591,10 +591,11 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { - if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) - list_move_tail(&gh->gh_list, &gl->gl_holders); + list_move_tail(&gh->gh_list, &gl->gl_holders); gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; + if (do_promote(gl)) + goto out; goto retry; } /* Some error or failed "try lock" - report it */ @@ -679,8 +680,7 @@ __acquires(&gl->gl_lockref.lock) gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; - lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | - LM_FLAG_PRIORITY); + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && @@ -834,7 +834,7 @@ __acquires(&gl->gl_lockref.lock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - if (do_promote(gl) == 0) + if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; @@ -1022,7 +1022,7 @@ static void delete_work_func(struct work_struct *work) * step entirely. */ if (gfs2_try_evict(gl)) { - if (test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + if (test_bit(SDF_KILL, &sdp->sd_flags)) goto out; if (gfs2_queue_verify_evict(gl)) return; @@ -1035,7 +1035,7 @@ static void delete_work_func(struct work_struct *work) GFS2_BLKST_UNLINKED); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -EAGAIN && - !test_bit(SDF_DEACTIVATING, &sdp->sd_flags) && + !test_bit(SDF_KILL, &sdp->sd_flags) && gfs2_queue_verify_evict(gl)) return; } else { @@ -1231,7 +1231,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, out_free: gfs2_glock_dealloc(&gl->gl_rcu); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + wake_up(&sdp->sd_kill_wait); out: return ret; @@ -1515,27 +1515,20 @@ fail: } if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) continue; - if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) - insert_pt = &gh2->gh_list; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); - if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) - goto do_cancel; return; } list_add_tail(&gh->gh_list, insert_pt); -do_cancel: gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); - if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { - spin_unlock(&gl->gl_lockref.lock); - if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl); - spin_lock(&gl->gl_lockref.lock); - } + spin_unlock(&gl->gl_lockref.lock); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_lockref.lock); return; trap_recursive: @@ -2017,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { if (!spin_trylock(&gl->gl_lockref.lock)) continue; - if (!gl->gl_lockref.count) { + if (gl->gl_lockref.count <= 1 && + (gl->gl_state == LM_ST_UNLOCKED || + demote_ok(gl))) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; @@ -2195,7 +2190,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); - wait_event_timeout(sdp->sd_glock_wait, + wait_event_timeout(sdp->sd_kill_wait, atomic_read(&sdp->sd_glock_disposal) == 0, HZ * 600); glock_hash_walk(dump_glock_func, sdp); @@ -2227,8 +2222,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'e'; if (flags & LM_FLAG_ANY) *p++ = 'A'; - if (flags & LM_FLAG_PRIORITY) - *p++ = 'p'; if (flags & LM_FLAG_NODE_SCOPE) *p++ = 'n'; if (flags & GL_ASYNC) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 1f1ba92c15a8..c8685ca7d2a2 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -68,14 +68,6 @@ enum { * also be granted in SHARED. The preferred state is whichever is compatible * with other granted locks, or the specified state if no other locks exist. * - * LM_FLAG_PRIORITY - * Override fairness considerations. Suppose a lock is held in a shared state - * and there is a pending request for the deferred state. A shared lock - * request with the priority flag would be allowed to bypass the deferred - * request and directly join the other shared lock. A shared lock request - * without the priority flag might be forced to wait until the deferred - * requested had acquired and released the lock. - * * LM_FLAG_NODE_SCOPE * This holder agrees to share the lock within this node. In other words, * the glock is held in EX mode according to DLM, but local holders on the @@ -86,7 +78,6 @@ enum { #define LM_FLAG_TRY_1CB 0x0002 #define LM_FLAG_NOEXP 0x0004 #define LM_FLAG_ANY 0x0008 -#define LM_FLAG_PRIORITY 0x0010 #define LM_FLAG_NODE_SCOPE 0x0020 #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54319328b16b..f41ca89d216b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -437,8 +437,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) inode->i_atime = atime; inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime); inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + inode_set_ctime(inode, be64_to_cpu(str->di_ctime), + be32_to_cpu(str->di_ctime_nsec)); ip->i_goal = be64_to_cpu(str->di_goal_meta); ip->i_generation = be64_to_cpu(str->di_generation); @@ -567,15 +567,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote) struct super_block *sb = sdp->sd_vfs; if (!remote || - gl->gl_state != LM_ST_SHARED || + (gl->gl_state != LM_ST_SHARED && + gl->gl_state != LM_ST_UNLOCKED) || gl->gl_demote_state != LM_ST_UNLOCKED) return; /* * Try to get an active super block reference to prevent racing with - * unmount (see trylock_super()). But note that unmount isn't the only - * place where a write lock on s_umount is taken, and we can fail here - * because of things like remount as well. + * unmount (see super_trylock_shared()). But note that unmount isn't + * the only place where a write lock on s_umount is taken, and we can + * fail here because of things like remount as well. */ if (down_read_trylock(&sb->s_umount)) { atomic_inc(&sb->s_active); @@ -637,7 +638,7 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!remote || sb_rdonly(sdp->sd_vfs) || - test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + test_bit(SDF_KILL, &sdp->sd_flags)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 04f2d78e8658..a8c95c5293c6 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -452,7 +452,7 @@ struct gfs2_quota_data { s64 qd_change_sync; unsigned int qd_slot; - unsigned int qd_slot_count; + unsigned int qd_slot_ref; struct buffer_head *qd_bh; struct gfs2_quota_change *qd_bh_qc; @@ -537,6 +537,7 @@ struct gfs2_statfs_change_host { #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 #define GFS2_QUOTA_ON 2 +#define GFS2_QUOTA_QUIET 3 /* on but not complaining */ #define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED #define GFS2_DATA_WRITEBACK 1 @@ -606,7 +607,7 @@ enum { SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are withdrawing */ - SDF_DEACTIVATING = 15, + SDF_KILL = 15, SDF_EVICTING = 16, SDF_FROZEN = 17, }; @@ -716,7 +717,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_freeze_gl; struct work_struct sd_freeze_work; - wait_queue_head_t sd_glock_wait; + wait_queue_head_t sd_kill_wait; wait_queue_head_t sd_async_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 17c994a0c0d0..0eac04507904 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -276,10 +276,16 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) * gfs2_lookup_simple callers expect ENOENT * and do not check for NULL. */ - if (inode == NULL) - return ERR_PTR(-ENOENT); - else - return inode; + if (IS_ERR_OR_NULL(inode)) + return inode ? inode : ERR_PTR(-ENOENT); + + /* + * Must not call back into the filesystem when allocating + * pages in the metadata inode's address space. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + return inode; } @@ -690,7 +696,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; @@ -1029,7 +1035,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); ihold(inode); d_instantiate(dentry, inode); mark_inode_dirty(inode); @@ -1114,7 +1120,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, return error; ip->i_entries = 0; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (S_ISDIR(inode->i_mode)) clear_nlink(inode); else @@ -1371,7 +1377,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, if (dir_rename) return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); mark_inode_dirty_sync(&ip->i_inode); return 0; } @@ -2071,7 +2077,7 @@ static int gfs2_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); @@ -2139,8 +2145,7 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset) return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); } -static int gfs2_update_time(struct inode *inode, struct timespec64 *time, - int flags) +static int gfs2_update_time(struct inode *inode, int flags) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_glock *gl = ip->i_gl; @@ -2155,7 +2160,8 @@ static int gfs2_update_time(struct inode *inode, struct timespec64 *time, if (error) return error; } - return generic_update_time(inode, time, flags); + generic_update_time(inode, flags); + return 0; } static const struct inode_operations gfs2_file_iops = { diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 54911294687c..59ab18c79889 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -222,11 +222,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, lkf |= DLM_LKF_NOQUEUEBAST; } - if (gfs_flags & LM_FLAG_PRIORITY) { - lkf |= DLM_LKF_NOORDER; - lkf |= DLM_LKF_HEADQUE; - } - if (gfs_flags & LM_FLAG_ANY) { if (req == DLM_LOCK_PR) lkf |= DLM_LKF_ALTCW; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index aa568796207c..e5271ae87d1c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -1227,6 +1227,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_unlock(sdp); } +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return atomic_read(&sdp->sd_log_pinned) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh1); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + return sdp->sd_jdesc->jd_blocks - + atomic_read(&sdp->sd_log_blks_free) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh2); +} + /** * gfs2_log_commit - Commit a transaction to the log * @sdp: the filesystem @@ -1246,9 +1261,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || - ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > - atomic_read(&sdp->sd_log_thresh2))) + if (gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp)) wake_up(&sdp->sd_logd_waitq); } @@ -1271,24 +1284,6 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); } -static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) -{ - return (atomic_read(&sdp->sd_log_pinned) + - atomic_read(&sdp->sd_log_blks_needed) >= - atomic_read(&sdp->sd_log_thresh1)); -} - -static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) -{ - unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); - - if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) - return 1; - - return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= - atomic_read(&sdp->sd_log_thresh2); -} - /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks * @data: Pointer to GFS2 superblock @@ -1301,14 +1296,11 @@ int gfs2_logd(void *data) { struct gfs2_sbd *sdp = data; unsigned long t = 1; - DEFINE_WAIT(wait); while (!kthread_should_stop()) { + if (gfs2_withdrawn(sdp)) + break; - if (gfs2_withdrawn(sdp)) { - msleep_interruptible(HZ); - continue; - } /* Check for errors writing to the journal */ if (sdp->sd_log_error) { gfs2_lm(sdp, @@ -1317,7 +1309,7 @@ int gfs2_logd(void *data) "prevent further damage.\n", sdp->sd_fsname, sdp->sd_log_error); gfs2_withdraw(sdp); - continue; + break; } if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { @@ -1326,7 +1318,9 @@ int gfs2_logd(void *data) GFS2_LFC_LOGD_JFLUSH_REQD); } - if (gfs2_ail_flush_reqd(sdp)) { + if (test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || + gfs2_ail_flush_reqd(sdp)) { + clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp, 0); @@ -1338,17 +1332,14 @@ int gfs2_logd(void *data) try_to_freeze(); - do { - prepare_to_wait(&sdp->sd_logd_waitq, &wait, - TASK_INTERRUPTIBLE); - if (!gfs2_ail_flush_reqd(sdp) && - !gfs2_jrnl_flush_reqd(sdp) && - !kthread_should_stop()) - t = schedule_timeout(t); - } while(t && !gfs2_ail_flush_reqd(sdp) && - !gfs2_jrnl_flush_reqd(sdp) && - !kthread_should_stop()); - finish_wait(&sdp->sd_logd_waitq, &wait); + t = wait_event_interruptible_timeout(sdp->sd_logd_waitq, + test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || + gfs2_ail_flush_reqd(sdp) || + gfs2_jrnl_flush_reqd(sdp) || + sdp->sd_log_error || + gfs2_withdrawn(sdp) || + kthread_should_stop(), + t); } return 0; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 251322b01631..483f69807062 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -456,7 +456,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * Find the folio with 'index' in the journal's mapping. Search the folio for * the journal head if requested (cleanup == false). Release refs on the * folio so the page cache can reclaim it. We grabbed a - * reference on this folio twice, first when we did a find_or_create_page() + * reference on this folio twice, first when we did a grab_cache_page() * to obtain the folio to add it to the bio and second when we do a * filemap_get_folio() here to get the folio to wait on while I/O on it is being * completed. @@ -481,7 +481,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, if (!*done) *done = gfs2_jhead_pg_srch(jd, head, &folio->page); - /* filemap_get_folio() and the earlier find_or_create_page() */ + /* filemap_get_folio() and the earlier grab_cache_page() */ folio_put_refs(folio, 2); } @@ -535,8 +535,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, for (; block < je->lblock + je->blocks; block++, dblock++) { if (!page) { - page = find_or_create_page(mapping, - block >> shift, GFP_NOFS); + page = grab_cache_page(mapping, block >> shift); if (!page) { ret = -ENOMEM; done = true; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index afcb32854f14..66eb98b690a2 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -152,9 +152,9 @@ static int __init init_gfs2_fs(void) goto fail_shrinker; error = -ENOMEM; - gfs_recovery_wq = alloc_workqueue("gfs_recovery", + gfs2_recovery_wq = alloc_workqueue("gfs2_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); - if (!gfs_recovery_wq) + if (!gfs2_recovery_wq) goto fail_wq1; gfs2_control_wq = alloc_workqueue("gfs2_control", @@ -162,7 +162,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_wq2; - gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0); + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); if (!gfs2_freeze_wq) goto fail_wq3; @@ -194,7 +194,7 @@ fail_mempool: fail_wq3: destroy_workqueue(gfs2_control_wq); fail_wq2: - destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_recovery_wq); fail_wq1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: @@ -234,7 +234,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_recovery_wq); destroy_workqueue(gfs2_control_wq); destroy_workqueue(gfs2_freeze_wq); list_lru_destroy(&gfs2_qd_lru); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8a27957dbfee..33ca04733e93 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -87,7 +87,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); - init_waitqueue_head(&sdp->sd_glock_wait); + init_waitqueue_head(&sdp->sd_kill_wait); init_waitqueue_head(&sdp->sd_async_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); init_completion(&sdp->sd_locking_init); @@ -1103,29 +1103,49 @@ static int init_threads(struct gfs2_sbd *sdp) struct task_struct *p; int error = 0; - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + p = kthread_create(gfs2_logd, sdp, "gfs2_logd/%s", sdp->sd_fsname); if (IS_ERR(p)) { error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); + fs_err(sdp, "can't create logd thread: %d\n", error); return error; } + get_task_struct(p); sdp->sd_logd_process = p; - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + p = kthread_create(gfs2_quotad, sdp, "gfs2_quotad/%s", sdp->sd_fsname); if (IS_ERR(p)) { error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); + fs_err(sdp, "can't create quotad thread: %d\n", error); goto fail; } + get_task_struct(p); sdp->sd_quotad_process = p; + + wake_up_process(sdp->sd_logd_process); + wake_up_process(sdp->sd_quotad_process); return 0; fail: kthread_stop(sdp->sd_logd_process); + put_task_struct(sdp->sd_logd_process); sdp->sd_logd_process = NULL; return error; } +void gfs2_destroy_threads(struct gfs2_sbd *sdp) +{ + if (sdp->sd_logd_process) { + kthread_stop(sdp->sd_logd_process); + put_task_struct(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; + } + if (sdp->sd_quotad_process) { + kthread_stop(sdp->sd_quotad_process); + put_task_struct(sdp->sd_quotad_process); + sdp->sd_quotad_process = NULL; + } +} + /** * gfs2_fill_super - Read in superblock * @sb: The VFS superblock @@ -1276,12 +1296,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) if (error) { gfs2_freeze_unlock(&sdp->sd_freeze_gh); - if (sdp->sd_quotad_process) - kthread_stop(sdp->sd_quotad_process); - sdp->sd_quotad_process = NULL; - if (sdp->sd_logd_process) - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; + gfs2_destroy_threads(sdp); fs_err(sdp, "can't make FS RW: %d\n", error); goto fail_per_node; } @@ -1381,6 +1396,7 @@ static const struct constant_table gfs2_param_quota[] = { {"off", GFS2_QUOTA_OFF}, {"account", GFS2_QUOTA_ACCOUNT}, {"on", GFS2_QUOTA_ON}, + {"quiet", GFS2_QUOTA_QUIET}, {} }; @@ -1786,9 +1802,9 @@ static void gfs2_kill_sb(struct super_block *sb) /* * Flush and then drain the delete workqueue here (via * destroy_workqueue()) to ensure that any delete work that - * may be running will also see the SDF_DEACTIVATING flag. + * may be running will also see the SDF_KILL flag. */ - set_bit(SDF_DEACTIVATING, &sdp->sd_flags); + set_bit(SDF_KILL, &sdp->sd_flags); gfs2_flush_delete_work(sdp); destroy_workqueue(sdp->sd_delete_wq); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 704192b73605..171b2713d2e5 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -109,38 +109,44 @@ static inline void spin_unlock_bucket(unsigned int hash) static void gfs2_qd_dealloc(struct rcu_head *rcu) { struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + struct gfs2_sbd *sdp = qd->qd_sbd; + kmem_cache_free(gfs2_quotad_cachep, qd); + if (atomic_dec_and_test(&sdp->sd_quota_count)) + wake_up(&sdp->sd_kill_wait); } -static void gfs2_qd_dispose(struct list_head *list) +static void gfs2_qd_dispose(struct gfs2_quota_data *qd) { - struct gfs2_quota_data *qd; - struct gfs2_sbd *sdp; - - while (!list_empty(list)) { - qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); - sdp = qd->qd_gl->gl_name.ln_sbd; - - list_del(&qd->qd_lru); + struct gfs2_sbd *sdp = qd->qd_sbd; - /* Free from the filesystem-specific list */ - spin_lock(&qd_lock); - list_del(&qd->qd_list); - spin_unlock(&qd_lock); + spin_lock(&qd_lock); + list_del(&qd->qd_list); + spin_unlock(&qd_lock); - spin_lock_bucket(qd->qd_hash); - hlist_bl_del_rcu(&qd->qd_hlist); - spin_unlock_bucket(qd->qd_hash); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + if (!gfs2_withdrawn(sdp)) { gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_slot_ref); gfs2_assert_warn(sdp, !qd->qd_bh_count); + } - gfs2_glock_put(qd->qd_gl); - atomic_dec(&sdp->sd_quota_count); + gfs2_glock_put(qd->qd_gl); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); +} - /* Delete it from the common reclaim list */ - call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); +static void gfs2_qd_list_dispose(struct list_head *list) +{ + struct gfs2_quota_data *qd; + + while (!list_empty(list)) { + qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); + list_del(&qd->qd_lru); + + gfs2_qd_dispose(qd); } } @@ -149,18 +155,22 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *dispose = arg; - struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); + struct gfs2_quota_data *qd = + list_entry(item, struct gfs2_quota_data, qd_lru); + enum lru_status status; if (!spin_trylock(&qd->qd_lockref.lock)) return LRU_SKIP; + status = LRU_SKIP; if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); list_lru_isolate_move(lru, &qd->qd_lru, dispose); + status = LRU_REMOVED; } spin_unlock(&qd->qd_lockref.lock); - return LRU_REMOVED; + return status; } static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, @@ -175,7 +185,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, gfs2_qd_isolate, &dispose); - gfs2_qd_dispose(&dispose); + gfs2_qd_list_dispose(&dispose); return freed; } @@ -203,12 +213,7 @@ static u64 qd2index(struct gfs2_quota_data *qd) static u64 qd2offset(struct gfs2_quota_data *qd) { - u64 offset; - - offset = qd2index(qd); - offset *= sizeof(struct gfs2_quota); - - return offset; + return qd2index(qd) * sizeof(struct gfs2_quota); } static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) @@ -221,7 +226,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - qd->qd_lockref.count = 1; + qd->qd_lockref.count = 0; spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; @@ -283,6 +288,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { + new_qd->qd_lockref.count++; *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); @@ -302,20 +308,31 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, static void qd_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); lockref_get(&qd->qd_lockref); } static void qd_put(struct gfs2_quota_data *qd) { + struct gfs2_sbd *sdp; + if (lockref_put_or_lock(&qd->qd_lockref)) return; + BUG_ON(__lockref_is_dead(&qd->qd_lockref)); + sdp = qd->qd_sbd; + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + lockref_mark_dead(&qd->qd_lockref); + spin_unlock(&qd->qd_lockref.lock); + + gfs2_qd_dispose(qd); + return; + } + qd->qd_lockref.count = 0; list_lru_add(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); - } static int slot_get(struct gfs2_quota_data *qd) @@ -325,20 +342,19 @@ static int slot_get(struct gfs2_quota_data *qd) int error = 0; spin_lock(&sdp->sd_bitmap_lock); - if (qd->qd_slot_count != 0) - goto out; - - error = -ENOSPC; - bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); - if (bit < sdp->sd_quota_slots) { + if (qd->qd_slot_ref == 0) { + bit = find_first_zero_bit(sdp->sd_quota_bitmap, + sdp->sd_quota_slots); + if (bit >= sdp->sd_quota_slots) { + error = -ENOSPC; + goto out; + } set_bit(bit, sdp->sd_quota_bitmap); qd->qd_slot = bit; - error = 0; -out: - qd->qd_slot_count++; } + qd->qd_slot_ref++; +out: spin_unlock(&sdp->sd_bitmap_lock); - return error; } @@ -347,8 +363,8 @@ static void slot_hold(struct gfs2_quota_data *qd) struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); - gfs2_assert(sdp, qd->qd_slot_count); - qd->qd_slot_count++; + gfs2_assert(sdp, qd->qd_slot_ref); + qd->qd_slot_ref++; spin_unlock(&sdp->sd_bitmap_lock); } @@ -357,8 +373,8 @@ static void slot_put(struct gfs2_quota_data *qd) struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); - gfs2_assert(sdp, qd->qd_slot_count); - if (!--qd->qd_slot_count) { + gfs2_assert(sdp, qd->qd_slot_ref); + if (!--qd->qd_slot_ref) { BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } @@ -367,7 +383,7 @@ static void slot_put(struct gfs2_quota_data *qd) static int bh_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct inode *inode = sdp->sd_qc_inode; struct gfs2_inode *ip = GFS2_I(inode); unsigned int block, offset; @@ -421,7 +437,7 @@ fail: static void bh_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; mutex_lock(&sdp->sd_quota_mutex); gfs2_assert(sdp, qd->qd_bh_count); @@ -451,6 +467,20 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, return 1; } +static int qd_bh_get_or_undo(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + int error; + + error = bh_get(qd); + if (!error) + return 0; + + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; +} + static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL, *iter; @@ -473,30 +503,29 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) spin_unlock(&qd_lock); if (qd) { - error = bh_get(qd); - if (error) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); + error = qd_bh_get_or_undo(sdp, qd); + if (error) return error; - } + *qdp = qd; } - *qdp = qd; - return 0; } -static void qd_unlock(struct gfs2_quota_data *qd) +static void qdsb_put(struct gfs2_quota_data *qd) { - gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd, - test_bit(QDF_LOCKED, &qd->qd_flags)); - clear_bit(QDF_LOCKED, &qd->qd_flags); bh_put(qd); slot_put(qd); qd_put(qd); } +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + qdsb_put(qd); +} + static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { @@ -523,13 +552,6 @@ fail: return error; } -static void qdsb_put(struct gfs2_quota_data *qd) -{ - bh_put(qd); - slot_put(qd); - qd_put(qd); -} - /** * gfs2_qa_get - make sure we have a quota allocations data structure, * if necessary @@ -666,7 +688,7 @@ static int sort_qd(const void *a, const void *b) static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); struct gfs2_quota_change *qc = qd->qd_bh_qc; s64 x; @@ -708,30 +730,29 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type) mutex_unlock(&sdp->sd_quota_mutex); } -static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, +static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, unsigned off, void *buf, unsigned bytes) { + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; struct page *page; struct buffer_head *bh; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; unsigned to_write = bytes, pg_off = off; - int done = 0; blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; - page = find_or_create_page(mapping, index, GFP_NOFS); + page = grab_cache_page(mapping, index); if (!page) return -ENOMEM; if (!page_has_buffers(page)) create_empty_buffers(page, bsize, 0); bh = page_buffers(page); - while (!done) { + for(;;) { /* Find the beginning block within the page */ if (pg_off >= ((bnum * bsize) + bsize)) { bh = bh->b_this_page; @@ -751,10 +772,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, set_buffer_uptodate(bh); if (bh_read(bh, REQ_META | REQ_PRIO) < 0) goto unlock_out; - if (gfs2_is_jdata(ip)) - gfs2_trans_add_data(ip->i_gl, bh); - else - gfs2_ordered_add_inode(ip); + gfs2_trans_add_data(ip->i_gl, bh); /* If we need to write to the next block as well */ if (to_write > (bsize - boff)) { @@ -763,7 +781,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, boff = pg_off % bsize; continue; } - done = 1; + break; } /* Write to the page, now that we have setup the buffer(s) */ @@ -780,12 +798,12 @@ unlock_out: return -EIO; } -static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, +static int gfs2_write_disk_quota(struct gfs2_sbd *sdp, struct gfs2_quota *qp, loff_t loc) { unsigned long pg_beg; unsigned pg_off, nbytes, overflow = 0; - int pg_oflow = 0, error; + int error; void *ptr; nbytes = sizeof(struct gfs2_quota); @@ -794,17 +812,15 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, pg_off = offset_in_page(loc); /* If the quota straddles a page boundary, split the write in two */ - if ((pg_off + nbytes) > PAGE_SIZE) { - pg_oflow = 1; + if ((pg_off + nbytes) > PAGE_SIZE) overflow = (pg_off + nbytes) - PAGE_SIZE; - } ptr = qp; - error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr, + error = gfs2_write_buf_to_page(sdp, pg_beg, pg_off, ptr, nbytes - overflow); /* If there's an overflow, write the remaining bytes to the next page */ - if (!error && pg_oflow) - error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0, + if (!error && overflow) + error = gfs2_write_buf_to_page(sdp, pg_beg + 1, 0, ptr + nbytes - overflow, overflow); return error; @@ -812,7 +828,7 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, /** * gfs2_adjust_quota - adjust record of current block usage - * @ip: The quota inode + * @sdp: The superblock * @loc: Offset of the entry in the quota file * @change: The amount of usage change to record * @qd: The quota data @@ -824,12 +840,12 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, * Returns: 0 or -ve on error */ -static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, +static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc, s64 change, struct gfs2_quota_data *qd, struct qc_dqblk *fdq) { + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_quota q; int err; u64 size; @@ -846,7 +862,6 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, return err; loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ - err = -EIO; be64_add_cpu(&q.qu_value, change); if (((s64)be64_to_cpu(q.qu_value)) < 0) q.qu_value = 0; /* Never go negative on quota usage */ @@ -866,12 +881,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, } } - err = gfs2_write_disk_quota(ip, &q, loc); + err = gfs2_write_disk_quota(sdp, &q, loc); if (!err) { size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); - inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } @@ -881,7 +896,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { - struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = (*qda)->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks, ind_blocks; @@ -893,18 +908,12 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int nalloc = 0, blocks; int error; - error = gfs2_qa_get(ip); - if (error) - return error; - gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); - if (!ghs) { - error = -ENOMEM; - goto out; - } + if (!ghs) + return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); inode_lock(&ip->i_inode); @@ -953,7 +962,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) for (x = 0; x < num_qd; x++) { qd = qda[x]; offset = qd2offset(qd); - error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); + error = gfs2_adjust_quota(sdp, offset, qd->qd_change_sync, qd, + NULL); if (error) goto out_end_trans; @@ -961,8 +971,6 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) set_bit(QDF_REFRESH, &qd->qd_flags); } - error = 0; - out_end_trans: gfs2_trans_end(sdp); out_ipres: @@ -976,8 +984,10 @@ out_dq: kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); -out: - gfs2_qa_put(ip); + if (!error) { + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = sdp->sd_quota_sync_gen; + } return error; } @@ -1009,11 +1019,12 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; int error; + gfs2_assert_warn(sdp, sdp == qd->qd_gl->gl_name.ln_sbd); restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); if (error) @@ -1059,9 +1070,10 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; u32 x; - int error = 0; + int error; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) return 0; error = gfs2_quota_hold(ip, uid, gid); @@ -1089,16 +1101,15 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) return error; } -static int need_sync(struct gfs2_quota_data *qd) +static bool need_sync(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_tune *gt = &sdp->sd_tune; s64 value; unsigned int num, den; - int do_sync = 1; if (!qd->qd_qb.qb_limit) - return 0; + return false; spin_lock(&qd_lock); value = qd->qd_change; @@ -1109,26 +1120,26 @@ static int need_sync(struct gfs2_quota_data *qd) den = gt->gt_quota_scale_den; spin_unlock(>->gt_spin); - if (value < 0) - do_sync = 0; + if (value <= 0) + return false; else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >= (s64)be64_to_cpu(qd->qd_qb.qb_limit)) - do_sync = 0; + return false; else { value *= gfs2_jindex_size(sdp) * num; value = div_s64(value, den); value += (s64)be64_to_cpu(qd->qd_qb.qb_value); if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) - do_sync = 0; + return false; } - return do_sync; + return true; } void gfs2_quota_unlock(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_quota_data *qda[4]; + struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS]; unsigned int count = 0; u32 x; int found; @@ -1138,7 +1149,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; - int sync; + bool sync; qd = ip->i_qadata->qa_qd[x]; sync = need_sync(qd); @@ -1154,15 +1165,8 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) if (!found) continue; - gfs2_assert_warn(sdp, qd->qd_change_sync); - if (bh_get(qd)) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); - continue; - } - - qda[count++] = qd; + if (!qd_bh_get_or_undo(sdp, qd)) + qda[count++] = qd; } if (count) { @@ -1178,12 +1182,13 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) static int print_message(struct gfs2_quota_data *qd, char *type) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - fs_info(sdp, "quota %s for %s %u\n", - type, - (qd->qd_id.type == USRQUOTA) ? "user" : "group", - from_kqid(&init_user_ns, qd->qd_id)); + if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) + fs_info(sdp, "quota %s for %s %u\n", + type, + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); return 0; } @@ -1269,7 +1274,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 x; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON || + if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) || gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) @@ -1288,6 +1294,24 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } +static bool qd_changed(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data *qd; + bool changed = false; + + spin_lock(&qd_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) + continue; + + changed = true; + break; + } + spin_unlock(&qd_lock); + return changed; +} + int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; @@ -1297,6 +1321,9 @@ int gfs2_quota_sync(struct super_block *sb, int type) unsigned int x; int error = 0; + if (!qd_changed(sdp)) + return 0; + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); if (!qda) return -ENOMEM; @@ -1318,10 +1345,6 @@ int gfs2_quota_sync(struct super_block *sb, int type) if (num_qd) { if (!error) error = do_sync(num_qd, qda); - if (!error) - for (x = 0; x < num_qd; x++) - qda[x]->qd_sync_gen = - sdp->sd_quota_sync_gen; for (x = 0; x < num_qd; x++) qd_unlock(qda[x]); @@ -1423,7 +1446,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; - qd->qd_slot_count = 1; + qd->qd_slot_ref = 1; spin_lock(&qd_lock); BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); @@ -1455,36 +1478,35 @@ fail: void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { - struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; + LIST_HEAD(dispose); + int count; - spin_lock(&qd_lock); - while (!list_empty(head)) { - qd = list_last_entry(head, struct gfs2_quota_data, qd_list); + BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); - list_del(&qd->qd_list); + spin_lock(&qd_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + spin_lock(&qd->qd_lockref.lock); + if (qd->qd_lockref.count != 0) { + spin_unlock(&qd->qd_lockref.lock); + continue; + } + lockref_mark_dead(&qd->qd_lockref); + spin_unlock(&qd->qd_lockref.lock); - /* Also remove if this qd exists in the reclaim list */ list_lru_del(&gfs2_qd_lru, &qd->qd_lru); - atomic_dec(&sdp->sd_quota_count); - spin_unlock(&qd_lock); - - spin_lock_bucket(qd->qd_hash); - hlist_bl_del_rcu(&qd->qd_hlist); - spin_unlock_bucket(qd->qd_hash); - - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - gfs2_assert_warn(sdp, !qd->qd_bh_count); - - gfs2_glock_put(qd->qd_gl); - call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); - - spin_lock(&qd_lock); + list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); - gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + gfs2_qd_list_dispose(&dispose); + + wait_event_timeout(sdp->sd_kill_wait, + (count = atomic_read(&sdp->sd_quota_count)) == 0, + HZ * 60); + + if (count != 0) + fs_err(sdp, "%d left-over quota data objects\n", count); kvfree(sdp->sd_quota_bitmap); sdp->sd_quota_bitmap = NULL; @@ -1536,12 +1558,11 @@ int gfs2_quotad(void *data) unsigned long statfs_timeo = 0; unsigned long quotad_timeo = 0; unsigned long t = 0; - DEFINE_WAIT(wait); while (!kthread_should_stop()) { - if (gfs2_withdrawn(sdp)) - goto bypass; + break; + /* Update the master statfs file */ if (sdp->sd_statfs_force_sync) { int error = gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -1559,15 +1580,16 @@ int gfs2_quotad(void *data) try_to_freeze(); -bypass: t = min(quotad_timeo, statfs_timeo); - prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); - if (!sdp->sd_statfs_force_sync) - t -= schedule_timeout(t); - else + t = wait_event_interruptible_timeout(sdp->sd_quota_wait, + sdp->sd_statfs_force_sync || + gfs2_withdrawn(sdp) || + kthread_should_stop(), + t); + + if (sdp->sd_statfs_force_sync) t = 0; - finish_wait(&sdp->sd_quota_wait, &wait); } return 0; @@ -1580,6 +1602,8 @@ static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) memset(state, 0, sizeof(*state)); switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_QUIET: + fallthrough; case GFS2_QUOTA_ON: state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; @@ -1726,7 +1750,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, goto out_release; /* Apply changes */ - error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + error = gfs2_adjust_quota(sdp, offset, 0, qd, fdq); if (!error) clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 21ada332d555..1429945215a0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) return 0; ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 9c7a9f640bad..5aae02669a40 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -27,7 +27,7 @@ #include "util.h" #include "dir.h" -struct workqueue_struct *gfs_recovery_wq; +struct workqueue_struct *gfs2_recovery_wq; int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) @@ -570,7 +570,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) return -EBUSY; /* we have JDF_RECOVERY, queue should always succeed */ - rv = queue_work(gfs_recovery_wq, &jd->jd_work); + rv = queue_work(gfs2_recovery_wq, &jd->jd_work); BUG_ON(!rv); if (wait) diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 0d30f8e804f4..7a0c9d0b7503 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -9,7 +9,7 @@ #include "incore.h" -extern struct workqueue_struct *gfs_recovery_wq; +extern struct workqueue_struct *gfs2_recovery_wq; static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d5d6549ee..02d93da21b2b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -412,7 +412,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec); + str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); @@ -429,7 +429,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_eattr = cpu_to_be64(ip->i_eattr); str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec); } /** @@ -546,20 +546,10 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) { int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - if (!test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + if (!test_bit(SDF_KILL, &sdp->sd_flags)) gfs2_flush_delete_work(sdp); - if (!log_write_allowed && current == sdp->sd_quotad_process) - fs_warn(sdp, "The quotad daemon is withdrawing.\n"); - else if (sdp->sd_quotad_process) - kthread_stop(sdp->sd_quotad_process); - sdp->sd_quotad_process = NULL; - - if (!log_write_allowed && current == sdp->sd_logd_process) - fs_warn(sdp, "The logd daemon is withdrawing.\n"); - else if (sdp->sd_logd_process) - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; + gfs2_destroy_threads(sdp); if (log_write_allowed) { gfs2_quota_sync(sdp->sd_vfs, 0); @@ -580,15 +570,8 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_log_is_empty(sdp), HZ * 5); gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp)); - } else { - wait_event_timeout(sdp->sd_log_waitq, - gfs2_log_is_empty(sdp), - HZ * 5); } gfs2_quota_cleanup(sdp); - - if (!log_write_allowed) - sdp->sd_vfs->s_flags |= SB_RDONLY; } /** @@ -622,6 +605,10 @@ restart: if (!sb_rdonly(sb)) { gfs2_make_fs_ro(sdp); } + if (gfs2_withdrawn(sdp)) { + gfs2_destroy_threads(sdp); + gfs2_quota_cleanup(sdp); + } WARN_ON(gfs2_withdrawing(sdp)); /* At this point, we're through modifying the disk */ @@ -689,7 +676,7 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) struct super_block *sb = sdp->sd_vfs; int error; - error = freeze_super(sb); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); if (error) return error; @@ -697,7 +684,9 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | GFS2_LFC_FREEZE_GO_SYNC); if (gfs2_withdrawn(sdp)) { - thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + return error; return -EIO; } } @@ -712,7 +701,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp) error = gfs2_freeze_lock_shared(sdp); if (error) goto fail; - error = thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (!error) return 0; @@ -761,7 +750,7 @@ out: * */ -static int gfs2_freeze_super(struct super_block *sb) +static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -816,7 +805,7 @@ out: * */ -static int gfs2_thaw_super(struct super_block *sb) +static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -1132,6 +1121,9 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) case GFS2_QUOTA_ON: state = "on"; break; + case GFS2_QUOTA_QUIET: + state = "quiet"; + break; default: state = "unknown"; break; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index bba58629bc45..ab9c83106932 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -36,6 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); +extern void gfs2_destroy_threads(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 2dfbe2f188dd..60a0206890c5 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -98,7 +98,10 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) "sd_log_flush_head: %d\n" "sd_log_flush_tail: %d\n" "sd_log_blks_reserved: %d\n" - "sd_log_revokes_available: %d\n", + "sd_log_revokes_available: %d\n" + "sd_log_pinned: %d\n" + "sd_log_thresh1: %d\n" + "sd_log_thresh2: %d\n", test_bit(SDF_JOURNAL_CHECKED, &f), test_bit(SDF_JOURNAL_LIVE, &f), (sdp->sd_jdesc ? sdp->sd_jdesc->jd_jid : 0), @@ -118,7 +121,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) test_bit(SDF_WITHDRAW_IN_PROG, &f), test_bit(SDF_REMOTE_WITHDRAW, &f), test_bit(SDF_WITHDRAW_RECOVERY, &f), - test_bit(SDF_DEACTIVATING, &f), + test_bit(SDF_KILL, &f), sdp->sd_log_error, rwsem_is_locked(&sdp->sd_log_flush_lock), sdp->sd_log_num_revoke, @@ -128,7 +131,10 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) sdp->sd_log_flush_head, sdp->sd_log_flush_tail, sdp->sd_log_blks_reserved, - atomic_read(&sdp->sd_log_revokes_available)); + atomic_read(&sdp->sd_log_revokes_available), + atomic_read(&sdp->sd_log_pinned), + atomic_read(&sdp->sd_log_thresh1), + atomic_read(&sdp->sd_log_thresh2)); return s; } @@ -168,10 +174,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) switch (n) { case 0: - error = thaw_super(sdp->sd_vfs); + error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); break; case 1: - error = freeze_super(sdp->sd_vfs); + error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); break; default: return -EINVAL; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index dac22b1c1a2e..da29fafb6272 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -9,6 +9,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/kthread.h> #include <linux/crc32.h> #include <linux/gfs2_ondisk.h> #include <linux/delay.h> @@ -150,7 +151,14 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) if (!sb_rdonly(sdp->sd_vfs)) { bool locked = mutex_trylock(&sdp->sd_freeze_mutex); - gfs2_make_fs_ro(sdp); + wake_up(&sdp->sd_logd_waitq); + wake_up(&sdp->sd_quota_wait); + + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), + HZ * 5); + + sdp->sd_vfs->s_flags |= SB_RDONLY; if (locked) mutex_unlock(&sdp->sd_freeze_mutex); @@ -315,19 +323,19 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; - if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && - test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { - if (!test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags)) - return -1; - - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG, - TASK_UNINTERRUPTIBLE); - return -1; - } - - set_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); - if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + unsigned long old = READ_ONCE(sdp->sd_flags), new; + + do { + if (old & BIT(SDF_WITHDRAWN)) { + wait_on_bit(&sdp->sd_flags, + SDF_WITHDRAW_IN_PROG, + TASK_UNINTERRUPTIBLE); + return -1; + } + new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG); + } while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new))); + fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 93b36d026bb4..4fea70c0fe3d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -311,7 +311,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, ea->ea_num_ptrs = 0; } - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -763,7 +763,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) goto out_end_trans; - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); out_end_trans: @@ -888,7 +888,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -1106,7 +1106,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) ea->ea_type = GFS2_EATYPE_UNUSED; } - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index d985066006d5..5ea5cd8ecea9 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -2,6 +2,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index d365bf0b8c77..632c226a3972 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i goto err1; dir->i_size++; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); hfs_find_exit(&fd); return 0; @@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) } dir->i_size--; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); res = 0; out: @@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; dst_dir->i_size++; - dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); + dst_dir->i_mtime = inode_set_ctime_current(dst_dir); mark_inode_dirty(dst_dir); /* finally remove the old entry */ @@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; src_dir->i_size--; - src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); + src_dir->i_mtime = inode_set_ctime_current(src_dir); mark_inode_dirty(src_dir); type = entry.type; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 3e1e3dcf0b48..b75c26045df4 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -263,7 +263,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) if (res) return res; clear_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 441d7fc952e3..ee349b72cfb3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; @@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode |= S_IWUGO; inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; - inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->file.MdDat); + inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, + hfs_m_to_utime(rec->file.MdDat)); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_size = be16_to_cpu(rec->dir.Val) + 2; HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); - inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->dir.MdDat); + inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, + hfs_m_to_utime(rec->dir.MdDat)); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; @@ -654,8 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 2875961fdc10..dc27d418fbcd 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -28,7 +28,9 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) /* fix up inode on a timezone change */ diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; if (diff) { - inode->i_ctime.tv_sec += diff; + struct timespec64 ctime = inode_get_ctime(inode); + + inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec); inode->i_atime.tv_sec += diff; inode->i_mtime.tv_sec += diff; HFS_I(inode)->tz_secondswest += diff; diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8034e7827a69..8ce4a33a9ac7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -2,6 +2,7 @@ config HFSPLUS_FS tristate "Apple Extended HFS file system support" depends on BLOCK + select BUFFER_HEAD select NLS select NLS_UTF8 select LEGACY_DIRECT_IO diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 35472cba750e..e71ae2537eaa 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dir->i_size++; if (S_ISDIR(inode->i_mode)) hfsplus_subfolders_inc(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); hfs_find_exit(&fd); @@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { @@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_size++; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_inc(dst_dir); - dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); + dst_dir->i_mtime = inode_set_ctime_current(dst_dir); /* finally remove the old entry */ err = hfsplus_cat_build_key(sb, src_fd.search_key, @@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(src_dir); - src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); + src_dir->i_mtime = inode_set_ctime_current(src_dir); /* remove old thread entry */ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 56fb5f1312e7..f5c4b3e31a1c 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -346,7 +346,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); ihold(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); sbi->file_count++; hfsplus_mark_mdb_dirty(dst_dir->i_sb); @@ -405,7 +405,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) hfsplus_delete_inode(inode); } else sbi->file_count--; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); out: mutex_unlock(&sbi->vh_mutex); @@ -426,7 +426,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) if (res) goto out; clear_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); hfsplus_delete_inode(inode); mark_inode_dirty(inode); out: diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 7a542f3dbe50..3c572e44f2ad 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -448,9 +448,9 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - pr_err("extend alloc file! (%llu,%u,%u)\n", - sbi->alloc_file->i_size * 8, - sbi->total_blocks, sbi->free_blocks); + pr_err_ratelimited("extend alloc file! (%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7d1a675e037d..c65c8c4b03dd 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap, } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); @@ -298,7 +298,7 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, inode->i_ino = sbi->next_cnid++; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); @@ -523,7 +523,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_size = 2 + be32_to_cpu(folder->valence); inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); - inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + inode_set_ctime_to_ts(inode, + hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -564,7 +565,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) } inode->i_atime = hfsp_mt2ut(file->access_date); inode->i_mtime = hfsp_mt2ut(file->content_mod_date); - inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + inode_set_ctime_to_ts(inode, + hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); @@ -609,7 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfsplus_cat_set_perms(inode, &folder->permissions); folder->access_date = hfsp_ut2mt(inode->i_atime); folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); - folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); folder->valence = cpu_to_be32(inode->i_size - 2); if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { folder->subfolders = @@ -644,7 +646,7 @@ int hfsplus_cat_write_inode(struct inode *inode) file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); file->access_date = hfsp_ut2mt(inode->i_atime); file->content_mod_date = hfsp_ut2mt(inode->i_mtime); - file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } @@ -700,7 +702,7 @@ int hfsplus_fileattr_set(struct mnt_idmap *idmap, else hip->userflags &= ~HFSPLUS_FLG_NODUMP; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 46387090eb76..dc5a5cea5fae 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -517,8 +517,7 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec }; ino->i_mtime = (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec }; - ino->i_ctime = - (struct timespec64){ st->ctime.tv_sec, st->ctime.tv_nsec }; + inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec); ino->i_size = st->size; ino->i_blocks = st->blocks; return 0; diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index ec975f466877..ac1e9318e65a 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select BUFFER_HEAD select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index f32f15669996..f36566d61215 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -277,10 +277,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in * inode. */ - if (!result->i_ctime.tv_sec) { - if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) - result->i_ctime.tv_sec = 1; - result->i_ctime.tv_nsec = 0; + if (!inode_get_ctime(result).tv_sec) { + time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)); + + inode_set_ctime(result, csec ? csec : 1, 0); result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); result->i_mtime.tv_nsec = 0; result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index e50e92a42432..479166378bae 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -36,7 +36,7 @@ void hpfs_init_inode(struct inode *i) hpfs_inode->i_rddir_off = NULL; hpfs_inode->i_dirty = 0; - i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0; + inode_set_ctime(i, 0, 0); i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; } @@ -232,7 +232,7 @@ void hpfs_write_inode_nolock(struct inode *i) if (de) { de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec)); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); hpfs_mark_4buffers_dirty(&qbh); @@ -242,7 +242,7 @@ void hpfs_write_inode_nolock(struct inode *i) if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec)); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); de->file_size = cpu_to_le32(0); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 69fb40b2c99a..f4eb8d6f5989 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -13,10 +13,9 @@ static void hpfs_update_directory_times(struct inode *dir) { time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb)); if (t == dir->i_mtime.tv_sec && - t == dir->i_ctime.tv_sec) + t == inode_get_ctime(dir).tv_sec) return; - dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; - dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + dir->i_mtime = inode_set_ctime(dir, t, 0); hpfs_write_inode_nolock(dir); } @@ -59,10 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; hpfs_i(result)->i_dno = dno; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; result->i_mode |= S_IFDIR; result->i_op = &hpfs_dir_iops; @@ -167,10 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, result->i_fop = &hpfs_file_ops; set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; if (dee.read_only) result->i_mode &= ~0222; @@ -250,10 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); @@ -326,10 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_init_inode(result); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; result->i_mode = S_IFLNK | 0777; result->i_uid = current_fsuid(); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1cb89595b875..758a51564124 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -729,8 +729,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) root->i_atime.tv_nsec = 0; root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); root->i_mtime.tv_nsec = 0; - root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); - root->i_ctime.tv_nsec = 0; + inode_set_ctime(root, + local_to_gmt(s, le32_to_cpu(de->creation_date)), + 0); hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7b17ccfa039d..316c4cebd3f3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -283,6 +283,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #endif /* + * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. + * Returns the maximum number of bytes one can read without touching the 1st raw + * HWPOISON subpage. + * + * The implementation borrows the iteration logic from copy_page_to_iter*. + */ +static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +{ + size_t n = 0; + size_t res = 0; + + /* First subpage to start the loop. */ + page += offset / PAGE_SIZE; + offset %= PAGE_SIZE; + while (1) { + if (is_raw_hwpoison_page_in_hugepage(page)) + break; + + /* Safe to read n bytes without touching HWPOISON subpage. */ + n = min(bytes, (size_t)PAGE_SIZE - offset); + res += n; + bytes -= n; + if (!bytes || !n) + break; + offset += n; + if (offset == PAGE_SIZE) { + page++; + offset = 0; + } + } + + return res; +} + +/* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). */ @@ -300,7 +335,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) while (iov_iter_count(to)) { struct page *page; - size_t nr, copied; + size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -328,16 +363,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { unlock_page(page); - if (PageHWPoison(page)) { - put_page(page); - retval = -EIO; - break; + if (!PageHWPoison(page)) + want = nr; + else { + /* + * Adjust how many bytes safe to read without + * touching the 1st raw HWPOISON subpage after + * offset. + */ + want = adjust_range_hwpoison(page, offset, nr); + if (want == 0) { + put_page(page); + retval = -EIO; + break; + } } /* * We have the page, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, nr, to); + copied = copy_page_to_iter(page, offset, want, to); put_page(page); } offset += copied; @@ -887,7 +932,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); out: inode_unlock(inode); return error; @@ -935,7 +980,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -979,7 +1024,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_mapping->private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { @@ -1022,7 +1067,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (!inode) return -ENOSPC; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; @@ -1054,7 +1099,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); d_tmpfile(file, inode); return finish_open_simple(file, 0); } @@ -1076,7 +1121,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, } else iput(inode); } - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); return error; } diff --git a/fs/inode.c b/fs/inode.c index 67611a360031..84bc3c76e5cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -751,16 +751,11 @@ EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on - * @kill_dirty: flag to guide handling of dirty inodes * - * Attempts to free all inodes for a given superblock. If there were any - * busy inodes return a non-zero value, else zero. - * If @kill_dirty is set, discard dirty inodes too, otherwise treat - * them as busy. + * Attempts to free all inodes (including dirty inodes) for a given superblock. */ -int invalidate_inodes(struct super_block *sb, bool kill_dirty) +void invalidate_inodes(struct super_block *sb) { - int busy = 0; struct inode *inode, *next; LIST_HEAD(dispose); @@ -772,14 +767,8 @@ again: spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { - spin_unlock(&inode->i_lock); - busy = 1; - continue; - } if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); - busy = 1; continue; } @@ -797,8 +786,6 @@ again: spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); - - return busy; } /* @@ -1850,6 +1837,7 @@ EXPORT_SYMBOL(bmap); static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { + struct timespec64 ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; @@ -1861,7 +1849,8 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, /* * Is ctime younger than or equal to atime? If yes, update atime: */ - if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + ctime = inode_get_ctime(inode); + if (timespec64_compare(&ctime, &inode->i_atime) >= 0) return 1; /* @@ -1876,29 +1865,76 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) +/** + * inode_update_timestamps - update the timestamps on the inode + * @inode: inode to be updated + * @flags: S_* flags that needed to be updated + * + * The update_time function is called when an inode's timestamps need to be + * updated for a read or write operation. This function handles updating the + * actual timestamps. It's up to the caller to ensure that the inode is marked + * dirty appropriately. + * + * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, + * attempt to update all three of them. S_ATIME updates can be handled + * independently of the rest. + * + * Returns a set of S_* flags indicating which values changed. + */ +int inode_update_timestamps(struct inode *inode, int flags) { - int dirty_flags = 0; + int updated = 0; + struct timespec64 now; + + if (flags & (S_MTIME|S_CTIME|S_VERSION)) { + struct timespec64 ctime = inode_get_ctime(inode); - if (flags & (S_ATIME | S_CTIME | S_MTIME)) { - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - - if (inode->i_sb->s_flags & SB_LAZYTIME) - dirty_flags |= I_DIRTY_TIME; - else - dirty_flags |= I_DIRTY_SYNC; + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + if (!timespec64_equal(&now, &inode->i_mtime)) { + inode->i_mtime = now; + updated |= S_MTIME; + } + if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) + updated |= S_VERSION; + } else { + now = current_time(inode); } - if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) - dirty_flags |= I_DIRTY_SYNC; + if (flags & S_ATIME) { + if (!timespec64_equal(&now, &inode->i_atime)) { + inode->i_atime = now; + updated |= S_ATIME; + } + } + return updated; +} +EXPORT_SYMBOL(inode_update_timestamps); +/** + * generic_update_time - update the timestamps on the inode + * @inode: inode to be updated + * @flags: S_* flags that needed to be updated + * + * The update_time function is called when an inode's timestamps need to be + * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, + * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME + * updates can be handled done independently of the rest. + * + * Returns a S_* mask indicating which fields were updated. + */ +int generic_update_time(struct inode *inode, int flags) +{ + int updated = inode_update_timestamps(inode, flags); + int dirty_flags = 0; + + if (updated & (S_ATIME|S_MTIME|S_CTIME)) + dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; + if (updated & S_VERSION) + dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); - return 0; + return updated; } EXPORT_SYMBOL(generic_update_time); @@ -1906,11 +1942,12 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) +int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); - return generic_update_time(inode, time, flags); + return inode->i_op->update_time(inode, flags); + generic_update_time(inode, flags); + return 0; } EXPORT_SYMBOL(inode_update_time); @@ -1962,7 +1999,6 @@ void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); - struct timespec64 now; if (!atime_needs_update(path, inode)) return; @@ -1981,8 +2017,7 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ - now = current_time(inode); - inode_update_time(inode, &now, S_ATIME); + inode_update_time(inode, S_ATIME); __mnt_drop_write(mnt); skip_update: sb_end_write(inode->i_sb); @@ -2067,18 +2102,21 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); -static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) +static int inode_needs_update_time(struct inode *inode) { int sync_it = 0; + struct timespec64 now = current_time(inode); + struct timespec64 ctime; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - if (!timespec64_equal(&inode->i_mtime, now)) + if (!timespec64_equal(&inode->i_mtime, &now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, now)) + ctime = inode_get_ctime(inode); + if (!timespec64_equal(&ctime, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2087,15 +2125,14 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) return sync_it; } -static int __file_update_time(struct file *file, struct timespec64 *now, - int sync_mode) +static int __file_update_time(struct file *file, int sync_mode) { int ret = 0; struct inode *inode = file_inode(file); /* try to update time settings */ if (!__mnt_want_write_file(file)) { - ret = inode_update_time(inode, now, sync_mode); + ret = inode_update_time(inode, sync_mode); __mnt_drop_write_file(file); } @@ -2120,13 +2157,12 @@ int file_update_time(struct file *file) { int ret; struct inode *inode = file_inode(file); - struct timespec64 now = current_time(inode); - ret = inode_needs_update_time(inode, &now); + ret = inode_needs_update_time(inode); if (ret <= 0) return ret; - return __file_update_time(file, &now, ret); + return __file_update_time(file, ret); } EXPORT_SYMBOL(file_update_time); @@ -2149,7 +2185,6 @@ static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); - struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. @@ -2162,13 +2197,13 @@ static int file_modified_flags(struct file *file, int flags) if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - ret = inode_needs_update_time(inode, &now); + ret = inode_needs_update_time(inode); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; - return __file_update_time(file, &now, ret); + return __file_update_time(file, ret); } /** @@ -2488,17 +2523,27 @@ struct timespec64 current_time(struct inode *inode) struct timespec64 now; ktime_get_coarse_real_ts64(&now); - - if (unlikely(!inode->i_sb)) { - WARN(1, "current_time() called with uninitialized super_block in the inode"); - return now; - } - return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); /** + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode + * + * Set the inode->i_ctime to the current value for the inode. Returns + * the current value that was assigned to i_ctime. + */ +struct timespec64 inode_set_ctime_current(struct inode *inode) +{ + struct timespec64 now = current_time(inode); + + inode_set_ctime(inode, now.tv_sec, now.tv_nsec); + return now; +} +EXPORT_SYMBOL(inode_set_ctime_current); + +/** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check diff --git a/fs/internal.h b/fs/internal.h index f7a3dc111026..d64ae03998cc 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,16 +23,10 @@ struct mnt_idmap; */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); - -void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } -static inline int emergency_thaw_bdev(struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ /* @@ -115,7 +109,7 @@ static inline void put_file_access(struct file *file) * super.c */ extern int reconfigure_super(struct fs_context *); -extern bool trylock_super(struct super_block *sb); +extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); @@ -201,7 +195,7 @@ void lock_two_inodes(struct inode *inode1, struct inode *inode2, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -extern int invalidate_inodes(struct super_block *, bool); +void invalidate_inodes(struct super_block *sb); /* * dcache.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 5b2481cd4750..f5fd99d6b0d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ -#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) -#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) -#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { @@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) + if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) @@ -396,8 +397,8 @@ static int ioctl_fsfreeze(struct file *filp) /* Freeze */ if (sb->s_op->freeze_super) - return sb->s_op->freeze_super(sb); - return freeze_super(sb); + return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); + return freeze_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_fsthaw(struct file *filp) @@ -409,8 +410,8 @@ static int ioctl_fsthaw(struct file *filp) /* Thaw */ if (sb->s_op->thaw_super) - return sb->s_op->thaw_super(sb); - return thaw_super(sb); + return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); + return thaw_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_file_dedupe_range(struct file *file, @@ -877,6 +878,9 @@ out: #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * @file: The file to operate on. + * @cmd: The ioctl command number. + * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index aa8967cca1a3..644479ccefbd 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -23,65 +23,169 @@ #define IOEND_BATCH_SIZE 4096 +typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); /* - * Structure allocated for each folio when block size < folio size - * to track sub-folio uptodate status and I/O completions. + * Structure allocated for each folio to track per-block uptodate, dirty state + * and I/O completions. */ -struct iomap_page { +struct iomap_folio_state { atomic_t read_bytes_pending; atomic_t write_bytes_pending; - spinlock_t uptodate_lock; - unsigned long uptodate[]; + spinlock_t state_lock; + + /* + * Each block has two bits in this bitmap: + * Bits [0..blocks_per_folio) has the uptodate status. + * Bits [b_p_f...(2*b_p_f)) has the dirty status. + */ + unsigned long state[]; }; -static inline struct iomap_page *to_iomap_page(struct folio *folio) +static struct bio_set iomap_ioend_bioset; + +static inline bool ifs_is_fully_uptodate(struct folio *folio, + struct iomap_folio_state *ifs) { - if (folio_test_private(folio)) - return folio_get_private(folio); - return NULL; + struct inode *inode = folio->mapping->host; + + return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static struct bio_set iomap_ioend_bioset; +static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, + unsigned int block) +{ + return test_bit(block, ifs->state); +} + +static void ifs_set_range_uptodate(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int first_blk = off >> inode->i_blkbits; + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk, nr_blks); + if (ifs_is_fully_uptodate(folio, ifs)) + folio_mark_uptodate(folio); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_uptodate(struct folio *folio, size_t off, + size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_uptodate(folio, ifs, off, len); + else + folio_mark_uptodate(folio); +} + +static inline bool ifs_block_is_dirty(struct folio *folio, + struct iomap_folio_state *ifs, int block) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + return test_bit(block + blks_per_folio, ifs->state); +} + +static void ifs_clear_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_clear_range_dirty(folio, ifs, off, len); +} + +static void ifs_set_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} -static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) +static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty(folio, ifs, off, len); +} + +static struct iomap_folio_state *ifs_alloc(struct inode *inode, + struct folio *folio, unsigned int flags) +{ + struct iomap_folio_state *ifs = folio->private; unsigned int nr_blocks = i_blocks_per_folio(inode, folio); gfp_t gfp; - if (iop || nr_blocks <= 1) - return iop; + if (ifs || nr_blocks <= 1) + return ifs; if (flags & IOMAP_NOWAIT) gfp = GFP_NOWAIT; else gfp = GFP_NOFS | __GFP_NOFAIL; - iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - gfp); - if (iop) { - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - folio_attach_private(folio, iop); - } - return iop; + /* + * ifs->state tracks two sets of state flags when the + * filesystem block size is smaller than the folio size. + * The first state tracks per-block uptodate and the + * second tracks per-block dirty state. + */ + ifs = kzalloc(struct_size(ifs, state, + BITS_TO_LONGS(2 * nr_blocks)), gfp); + if (!ifs) + return ifs; + + spin_lock_init(&ifs->state_lock); + if (folio_test_uptodate(folio)) + bitmap_set(ifs->state, 0, nr_blocks); + if (folio_test_dirty(folio)) + bitmap_set(ifs->state, nr_blocks, nr_blocks); + folio_attach_private(folio, ifs); + + return ifs; } -static void iomap_page_release(struct folio *folio) +static void ifs_free(struct folio *folio) { - struct iomap_page *iop = folio_detach_private(folio); - struct inode *inode = folio->mapping->host; - unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + struct iomap_folio_state *ifs = folio_detach_private(folio); - if (!iop) + if (!ifs) return; - WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); - WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); - WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending)); + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); + WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); - kfree(iop); + kfree(ifs); } /* @@ -90,7 +194,7 @@ static void iomap_page_release(struct folio *folio) static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; @@ -105,12 +209,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ - if (iop) { + if (ifs) { unsigned int i; /* move forward for each leading block marked uptodate */ for (i = first; i <= last; i++) { - if (!test_bit(i, iop->uptodate)) + if (!ifs_block_is_uptodate(ifs, i)) break; *pos += block_size; poff += block_size; @@ -120,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, /* truncate len if we find any trailing uptodate block(s) */ for ( ; i <= last; i++) { - if (test_bit(i, iop->uptodate)) { + if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; break; @@ -144,43 +248,19 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, *lenp = plen; } -static void iomap_iop_set_range_uptodate(struct folio *folio, - struct iomap_page *iop, size_t off, size_t len) -{ - struct inode *inode = folio->mapping->host; - unsigned first = off >> inode->i_blkbits; - unsigned last = (off + len - 1) >> inode->i_blkbits; - unsigned long flags; - - spin_lock_irqsave(&iop->uptodate_lock, flags); - bitmap_set(iop->uptodate, first, last - first + 1); - if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio))) - folio_mark_uptodate(folio); - spin_unlock_irqrestore(&iop->uptodate_lock, flags); -} - -static void iomap_set_range_uptodate(struct folio *folio, - struct iomap_page *iop, size_t off, size_t len) -{ - if (iop) - iomap_iop_set_range_uptodate(folio, iop, off, len); - else - folio_mark_uptodate(folio); -} - static void iomap_finish_folio_read(struct folio *folio, size_t offset, size_t len, int error) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; if (unlikely(error)) { folio_clear_uptodate(folio); folio_set_error(folio); } else { - iomap_set_range_uptodate(folio, iop, offset, len); + iomap_set_range_uptodate(folio, offset, len); } - if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) + if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending)) folio_unlock(folio); } @@ -213,7 +293,6 @@ struct iomap_readpage_ctx { static int iomap_read_inline_data(const struct iomap_iter *iter, struct folio *folio) { - struct iomap_page *iop; const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); @@ -231,15 +310,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio, iter->flags); - else - iop = to_iomap_page(folio); + ifs_alloc(iter->inode, folio, iter->flags); addr = kmap_local_folio(folio, offset); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_local(addr); - iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); + iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff); return 0; } @@ -260,7 +337,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; struct folio *folio = ctx->cur_folio; - struct iomap_page *iop; + struct iomap_folio_state *ifs; loff_t orig_pos = pos; size_t poff, plen; sector_t sector; @@ -269,20 +346,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio, iter->flags); + ifs = ifs_alloc(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; if (iomap_block_needs_zeroing(iter, pos)) { folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, iop, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); goto done; } ctx->cur_folio_in_bio = true; - if (iop) - atomic_add(plen, &iop->read_bytes_pending); + if (ifs) + atomic_add(plen, &ifs->read_bytes_pending); sector = iomap_sector(iomap, pos); if (!ctx->bio || @@ -436,11 +513,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead); */ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; unsigned first, last, i; - if (!iop) + if (!ifs) return false; /* Caller's range may extend past the end of this folio */ @@ -451,7 +528,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) last = (from + count - 1) >> inode->i_blkbits; for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) + if (!ifs_block_is_uptodate(ifs, i)) return false; return true; } @@ -461,16 +538,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); * iomap_get_folio - get a folio reference for writing * @iter: iteration structure * @pos: start offset of write + * @len: Suggested size of folio to create. * * Returns a locked reference to the folio at @pos, or an error pointer if the * folio could not be obtained. */ -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) { - unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; + fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); @@ -483,14 +562,13 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) folio_size(folio)); /* - * mm accommodates an old ext3 case where clean folios might - * not have had the dirty bit cleared. Thus, it can send actual - * dirty folios to ->release_folio() via shrink_active_list(); - * skip those here. + * If the folio is dirty, we refuse to release our metadata because + * it may be partially dirty. Once we track per-block dirty state, + * we can release the metadata if every block is dirty. */ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) + if (folio_test_dirty(folio)) return false; - iomap_page_release(folio); + ifs_free(folio); return true; } EXPORT_SYMBOL_GPL(iomap_release_folio); @@ -507,16 +585,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) if (offset == 0 && len == folio_size(folio)) { WARN_ON_ONCE(folio_test_writeback(folio)); folio_cancel_dirty(folio); - iomap_page_release(folio); - } else if (folio_test_large(folio)) { - /* Must release the iop so the page can be split */ - WARN_ON_ONCE(!folio_test_uptodate(folio) && - folio_test_dirty(folio)); - iomap_page_release(folio); + ifs_free(folio); } } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + size_t len = folio_size(folio); + + ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, len); + return filemap_dirty_folio(mapping, folio); +} +EXPORT_SYMBOL_GPL(iomap_dirty_folio); + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -547,7 +631,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop; + struct iomap_folio_state *ifs; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); @@ -555,14 +639,25 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; - if (folio_test_uptodate(folio)) + /* + * If the write or zeroing completely overlaps the current folio, then + * entire folio will be dirtied so there is no need for + * per-block state tracking structures to be attached to this folio. + * For the unshare case, we must read in the ondisk contents because we + * are not changing pagecache contents. + */ + if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && + pos + len >= folio_pos(folio) + folio_size(folio)) return 0; - folio_clear_error(folio); - iop = iomap_page_create(iter->inode, folio, iter->flags); - if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + ifs = ifs_alloc(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1) return -EAGAIN; + if (folio_test_uptodate(folio)) + return 0; + folio_clear_error(folio); + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -589,7 +684,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (status) return status; } - iomap_set_range_uptodate(folio, iop, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -603,7 +698,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, if (folio_ops && folio_ops->get_folio) return folio_ops->get_folio(iter, pos, len); else - return iomap_get_folio(iter, pos); + return iomap_get_folio(iter, pos, len); } static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, @@ -696,7 +791,6 @@ out_unlock: static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { - struct iomap_page *iop = to_iomap_page(folio); flush_dcache_folio(folio); /* @@ -712,7 +806,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return 0; - iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); + iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); + iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); return copied; } @@ -773,6 +868,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); + size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; ssize_t written = 0; long status = 0; @@ -781,15 +877,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; - struct page *page; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ + size_t offset; /* Offset into folio */ + size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - offset = offset_in_page(pos); - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_count(i)); -again: + offset = pos & (chunk - 1); + bytes = min(chunk - offset, iov_iter_count(i)); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) @@ -819,12 +912,14 @@ again: if (iter->iomap.flags & IOMAP_F_STALE) break; - page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; - copied = copy_page_from_iter_atomic(page, offset, bytes, i); + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); status = iomap_write_end(iter, pos, bytes, copied, folio); if (unlikely(copied != status)) @@ -840,11 +935,13 @@ again: */ if (copied) bytes = copied; - goto again; + if (chunk > PAGE_SIZE) + chunk /= 2; + } else { + pos += status; + written += status; + length -= status; } - pos += status; - written += status; - length -= status; } while (iov_iter_count(i) && length); if (status == -EAGAIN) { @@ -880,6 +977,76 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static int iomap_write_delalloc_ifs_punch(struct inode *inode, + struct folio *folio, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + unsigned int first_blk, last_blk, i; + loff_t last_byte; + u8 blkbits = inode->i_blkbits; + struct iomap_folio_state *ifs; + int ret = 0; + + /* + * When we have per-block dirty tracking, there can be + * blocks within a folio which are marked uptodate + * but not dirty. In that case it is necessary to punch + * out such blocks to avoid leaking any delalloc blocks. + */ + ifs = folio->private; + if (!ifs) + return ret; + + last_byte = min_t(loff_t, end_byte - 1, + folio_pos(folio) + folio_size(folio) - 1); + first_blk = offset_in_folio(folio, start_byte) >> blkbits; + last_blk = offset_in_folio(folio, last_byte) >> blkbits; + for (i = first_blk; i <= last_blk; i++) { + if (!ifs_block_is_dirty(folio, ifs, i)) { + ret = punch(inode, folio_pos(folio) + (i << blkbits), + 1 << blkbits); + if (ret) + return ret; + } + } + + return ret; +} + + +static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + int ret = 0; + + if (!folio_test_dirty(folio)) + return ret; + + /* if dirty, punch up to offset */ + if (start_byte > *punch_start_byte) { + ret = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (ret) + return ret; + } + + /* Punch non-dirty blocks within folio */ + ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, + end_byte, punch); + if (ret) + return ret; + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_pos(folio) + folio_size(folio)); + + return ret; +} + /* * Scan the data range passed to us for dirty page cache folios. If we find a * dirty folio, punch out the preceeding range and update the offset from which @@ -899,10 +1066,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); */ static int iomap_write_delalloc_scan(struct inode *inode, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, - int (*punch)(struct inode *inode, loff_t offset, loff_t length)) + iomap_punch_t punch) { while (start_byte < end_byte) { struct folio *folio; + int ret; /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, @@ -913,26 +1081,12 @@ static int iomap_write_delalloc_scan(struct inode *inode, continue; } - /* if dirty, punch up to offset */ - if (folio_test_dirty(folio)) { - if (start_byte > *punch_start_byte) { - int error; - - error = punch(inode, *punch_start_byte, - start_byte - *punch_start_byte); - if (error) { - folio_unlock(folio); - folio_put(folio); - return error; - } - } - - /* - * Make sure the next punch start is correctly bound to - * the end of this data range, not the end of the folio. - */ - *punch_start_byte = min_t(loff_t, end_byte, - folio_next_index(folio) << PAGE_SHIFT); + ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, + start_byte, end_byte, punch); + if (ret) { + folio_unlock(folio); + folio_put(folio); + return ret; } /* move offset to start of next folio in range */ @@ -977,8 +1131,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, * the code to subtle off-by-one bugs.... */ static int iomap_write_delalloc_release(struct inode *inode, - loff_t start_byte, loff_t end_byte, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)) + loff_t start_byte, loff_t end_byte, iomap_punch_t punch) { loff_t punch_start_byte = start_byte; loff_t scan_end_byte = min(i_size_read(inode), end_byte); @@ -1071,8 +1224,7 @@ out_unlock: */ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, - ssize_t written, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)) + ssize_t written, iomap_punch_t punch) { loff_t start_byte; loff_t end_byte; @@ -1111,7 +1263,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); - long status = 0; loff_t written = 0; /* don't bother with blocks that are not shared to start with */ @@ -1122,28 +1273,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; do { - unsigned long offset = offset_in_page(pos); - unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - if (iter->iomap.flags & IOMAP_F_STALE) + if (iomap->flags & IOMAP_F_STALE) break; - status = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(status == 0)) + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) return -EIO; cond_resched(); - pos += status; - written += status; - length -= status; + pos += bytes; + written += bytes; + length -= bytes; balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length); + } while (length > 0); return written; } @@ -1286,24 +1442,24 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len, int error) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; if (error) { folio_set_error(folio); mapping_set_error(inode->i_mapping, error); } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); - if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) folio_end_writeback(folio); } @@ -1570,7 +1726,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, */ static void iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_page *iop, struct iomap_writepage_ctx *wpc, + struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct list_head *iolist) { sector_t sector = iomap_sector(&wpc->iomap, pos); @@ -1588,8 +1744,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } - if (iop) - atomic_add(len, &iop->write_bytes_pending); + if (ifs) + atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); } @@ -1615,7 +1771,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio, 0); + struct iomap_folio_state *ifs = folio->private; struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); @@ -1623,7 +1779,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); + WARN_ON_ONCE(end_pos <= pos); + + if (!ifs && nblocks > 1) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); /* * Walk through the folio to find areas to write back. If we @@ -1631,7 +1794,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * invalid, grab a new one. */ for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (iop && !test_bit(i, iop->uptodate)) + if (ifs && !ifs_block_is_dirty(folio, ifs, i)) continue; error = wpc->ops->map_blocks(wpc, inode, pos); @@ -1642,7 +1805,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, continue; if (wpc->iomap.type == IOMAP_HOLE) continue; - iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, + iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, &submit_list); count++; } @@ -1675,6 +1838,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, } } + /* + * We can have dirty bits set past end of file in page_mkwrite path + * while mapping the last partial folio. Hence it's better to clear + * all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); folio_start_writeback(folio); folio_unlock(folio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index ea3b868c8355..bcd3f8cf5ea4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -20,10 +20,12 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_WRITE_FUA (1 << 28) -#define IOMAP_DIO_NEED_SYNC (1 << 29) -#define IOMAP_DIO_WRITE (1 << 30) -#define IOMAP_DIO_DIRTY (1 << 31) +#define IOMAP_DIO_CALLER_COMP (1U << 26) +#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_WRITE_THROUGH (1U << 28) +#define IOMAP_DIO_NEED_SYNC (1U << 29) +#define IOMAP_DIO_WRITE (1U << 30) +#define IOMAP_DIO_DIRTY (1U << 31) struct iomap_dio { struct kiocb *iocb; @@ -41,7 +43,6 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { + struct kiocb *iocb = dio->iocb; + atomic_inc(&dio->ref); /* Sync dio can't be polled reliably */ - if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { - bio_set_polled(bio, dio->iocb); - dio->submit.poll_bio = bio; + if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) { + bio_set_polled(bio, iocb); + WRITE_ONCE(iocb->private, bio); } if (dio->dops && dio->dops->submit_io) @@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); +static ssize_t iomap_dio_deferred_complete(void *data) +{ + return iomap_dio_complete(data); +} + static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + struct kiocb *iocb = dio->iocb; if (bio->bi_status) iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + if (!atomic_dec_and_test(&dio->ref)) + goto release_bio; - if (atomic_dec_and_test(&dio->ref)) { - if (dio->wait_for_completion) { - struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->submit.waiter, NULL); - blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_WRITE) { - struct inode *inode = file_inode(dio->iocb->ki_filp); - - WRITE_ONCE(dio->iocb->private, NULL); - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); - } else { - WRITE_ONCE(dio->iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } + /* + * Synchronous dio, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ + if (dio->wait_for_completion) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + blk_wake_io_task(waiter); + goto release_bio; + } + + /* + * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline + */ + if (dio->flags & IOMAP_DIO_INLINE_COMP) { + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); + goto release_bio; + } + + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule + * our completion that way to avoid an async punt to a workqueue. + */ + if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* only polled IO cares about private cleared */ + iocb->private = dio; + iocb->dio_complete = iomap_dio_deferred_complete; + + /* + * Invoke ->ki_complete() directly. We've assigned our + * dio_complete callback handler, and since the issuer set + * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will + * notice ->dio_complete being set and will defer calling that + * handler until it can be done from a safe task context. + * + * Note that the 'res' being passed in here is not important + * for this case. The actual completion value of the request + * will be gotten from dio_complete when that is run by the + * issuer. + */ + iocb->ki_complete(iocb, 0); + goto release_bio; } + /* + * Async DIO completion that requires filesystem level completion work + * gets punted to a work queue to complete as the operation may require + * more IO to be issued to finalise filesystem metadata changes or + * guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, + &dio->aio.work); +release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, /* * Figure out the bio's operation flags from the dio request, the * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_FUA flag in the dio request. + * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) @@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, if (use_fua) opflags |= REQ_FUA; else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; return opflags; } @@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * Use a FUA write if we need datasync semantics, this is a pure * data IO that doesn't require any metadata updates (including * after IO completion such as unwritten extent conversion) and - * the underlying device supports FUA. This allows us to avoid - * cache flushes on IO completion. + * the underlying device either supports FUA or doesn't have + * a volatile write cache. This allows us to avoid cache flushes + * on IO completion. If we can't use writethrough and need to + * sync, disable in-task completions as dio completion will + * need to call generic_write_sync() which will do a blocking + * fsync / cache flush call. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) + (dio->flags & IOMAP_DIO_WRITE_THROUGH) && + (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) use_fua = true; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; } /* @@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only poll for single bio I/Os. + * We can only do deferred completion for pure overwrites that + * don't require additional IO at completion. This rules out + * writes that need zeroing or extent conversion, extend + * the file size, or issue journal IO or cache flushes + * during completion processing. */ if (need_zeroout || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + + /* + * The rules for polled IO completions follow the guidelines as the + * ones we set for inline and deferred completions. If none of those + * are available for this IO, clear the polled flag. + */ + if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.poll_bio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; if (iov_iter_rw(iter) == READ) { + /* reads can always complete inline */ + dio->flags |= IOMAP_DIO_INLINE_COMP; + if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + /* + * Flag as supporting deferred completions, if the issuer + * groks it. This can avoid a workqueue punt for writes. + * We may later clear this flag if we need to do other IO + * as part of this IO completion. + */ + if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) + dio->flags |= IOMAP_DIO_CALLER_COMP; + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_NEED_SYNC; /* - * For datasync only writes, we optimistically try - * using FUA for this IO. Any non-FUA write that - * occurs will clear this flag, hence we know before - * completion whether a cache flush is necessary. + * For datasync only writes, we optimistically try using + * WRITE_THROUGH for this IO. This flag requires either + * FUA writes through the device's write cache, or a + * normal write to a device without a volatile write + * cache. For the former, Any non-FUA write that occurs + * will clear this flag, hence we know before completion + * whether a cache flush is necessary. */ if (!(iocb->ki_flags & IOCB_SYNC)) - dio->flags |= IOMAP_DIO_WRITE_FUA; + dio->flags |= IOMAP_DIO_WRITE_THROUGH; } /* @@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomap_dio_set_error(dio, ret); /* - * If all the writes we issued were FUA, we don't need to flush the - * cache on IO completion. Clear the sync flag for this case. + * If all the writes we issued were already written through to the + * media, we don't need to flush the cache on IO completion. Clear the + * sync flag for this case. */ - if (dio->flags & IOMAP_DIO_WRITE_FUA) + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->private, dio->submit.poll_bio); - /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three different diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 08ffd37b9bb8..51434f2a471b 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISO9660_FS tristate "ISO 9660 CDROM file system support" + select BUFFER_HEAD help This is the standard file system used on CD-ROMs. It was previously known as "High Sierra File System" and is called "hsfs" on other diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index df9d70588b60..2ee21286ac8f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1422,13 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_ino, de->flags[-high_sierra]); } #endif - - inode->i_mtime.tv_sec = - inode->i_atime.tv_sec = - inode->i_ctime.tv_sec = iso_date(de->date, high_sierra); - inode->i_mtime.tv_nsec = - inode->i_atime.tv_nsec = - inode->i_ctime.tv_nsec = 0; + inode->i_mtime = inode->i_atime = + inode_set_ctime(inode, iso_date(de->date, high_sierra), 0); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 48f58c6c9e69..348783a70f57 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -421,10 +421,9 @@ repeat: /* Rock ridge never appears on a High Sierra disk */ cnt = 0; if (rr->u.TF.flags & TF_CREATE) { - inode->i_ctime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_MODIFY) { inode->i_mtime.tv_sec = @@ -439,10 +438,9 @@ repeat: inode->i_atime.tv_nsec = 0; } if (rr->u.TF.flags & TF_ATTRIBUTES) { - inode->i_ctime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } break; case SIG('S', 'L'): @@ -534,7 +532,7 @@ repeat: inode->i_size = reloc->i_size; inode->i_blocks = reloc->i_blocks; inode->i_atime = reloc->i_atime; - inode->i_ctime = reloc->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(reloc)); inode->i_mtime = reloc->i_mtime; iput(reloc); break; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9ec91017a7f3..118699fff2f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -41,18 +41,6 @@ static inline void __buffer_unlink(struct journal_head *jh) } /* - * Check a checkpoint buffer could be release or not. - * - * Requires j_list_lock - */ -static inline bool __cp_buffer_busy(struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); -} - -/* * __jbd2_log_wait_for_space: wait until there is space in the journal. * * Called under j-state_lock *only*. It will be unlocked if we have to wait @@ -349,6 +337,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ +enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; + /* * journal_shrink_one_cp_list * @@ -360,7 +350,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - bool destroy, bool *released) + enum shrink_type type, + bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -376,12 +367,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (destroy) { + if (type == SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); - if (ret < 0) - continue; + if (ret < 0) { + if (type == SHRINK_BUSY_SKIP) + continue; + break; + } } nr_freed++; @@ -445,7 +439,7 @@ again: tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - false, &released); + SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -485,19 +479,21 @@ out: void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; + enum shrink_type type; bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) return; + type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; journal_shrink_one_cp_list(transaction->t_checkpoint_list, - destroy, &released); + type, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -631,6 +627,8 @@ int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + if (jh->b_transaction) + return -EBUSY; if (!trylock_buffer(bh)) return -EBUSY; if (buffer_dirty(bh)) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1073259902a6..8d6f934c3d95 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -298,14 +298,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal, static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { - struct page *page = bh->b_page; char *addr; __u32 checksum; - addr = kmap_atomic(page); - checksum = crc32_be(crc32_sum, - (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); - kunmap_atomic(addr); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); + checksum = crc32_be(crc32_sum, addr, bh->b_size); + kunmap_local(addr); return checksum; } @@ -322,7 +320,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; - struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; @@ -331,11 +328,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, return; seq = cpu_to_be32(sequence); - addr = kmap_atomic(page); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), - bh->b_size); - kunmap_atomic(addr); + csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + kunmap_local(addr); if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fbce16fedaa4..30dec2bd2ecc 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -115,14 +115,6 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3_feature(j)) - return 1; - - return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; -} - static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { __u32 csum; @@ -341,7 +333,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; - struct page *new_page; + struct folio *new_folio; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); journal_t *journal = transaction->t_journal; @@ -370,14 +362,14 @@ repeat: */ if (jh_in->b_frozen_data) { done_copy_out = 1; - new_page = virt_to_page(jh_in->b_frozen_data); - new_offset = offset_in_page(jh_in->b_frozen_data); + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); } else { - new_page = jh2bh(jh_in)->b_page; - new_offset = offset_in_page(jh2bh(jh_in)->b_data); + new_folio = jh2bh(jh_in)->b_folio; + new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page); + mapped_data = kmap_local_folio(new_folio, new_offset); /* * Fire data frozen trigger if data already wasn't frozen. Do this * before checking for escaping, as the trigger may modify the magic @@ -385,18 +377,17 @@ repeat: * data in the buffer. */ if (!done_copy_out) - jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); /* * Check for escaping */ - if (*((__be32 *)(mapped_data + new_offset)) == - cpu_to_be32(JBD2_MAGIC_NUMBER)) { + if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) { need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data); + kunmap_local(mapped_data); /* * Do we need to do a data copy? @@ -417,12 +408,10 @@ repeat: } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, bh_in->b_size); - kunmap_atomic(mapped_data); + memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); - new_page = virt_to_page(tmp); - new_offset = offset_in_page(tmp); + new_folio = virt_to_folio(tmp); + new_offset = offset_in_folio(new_folio, tmp); done_copy_out = 1; /* @@ -438,12 +427,12 @@ repeat: * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page); - *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data); + mapped_data = kmap_local_folio(new_folio, new_offset); + *((unsigned int *)mapped_data) = 0; + kunmap_local(mapped_data); } - set_bh_page(new_bh, new_page, new_offset); + folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; @@ -1337,6 +1326,189 @@ static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, } /* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock(journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Check the superblock for a given journal, performing initial + * validation of the format. + */ +static int journal_check_superblock(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + int num_fc_blks; + int err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); + return err; + } + + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); + return err; + } + + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + printk(KERN_WARNING "JBD2: journal file too short\n"); + return err; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_total_len) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + return err; + } + + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (!jbd2_format_support_feature(journal)) + return 0; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); + return err; + } + + num_fc_blks = jbd2_has_feature_fast_commit(journal) ? + jbd2_journal_get_num_fc_blks(sb) : 0; + if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || + be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { + printk(KERN_ERR "JBD2: journal file too short %u,%d\n", + be32_to_cpu(sb->s_maxlen), num_fc_blks); + return err; + } + + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + return err; + } + + if (jbd2_journal_has_csum_v2or3_feature(journal) && + jbd2_has_feature_checksum(journal)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + return err; + } + + /* Load the checksum driver */ + if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + return err; + } + + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + return err; + } + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + return err; + } + } + + return 0; +} + +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ +static int journal_load_superblock(journal_t *journal) +{ + int err; + struct buffer_head *bh; + journal_superblock_t *sb; + + bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, + journal->j_blocksize); + if (bh) + err = bh_read(bh, 0); + if (!bh || err < 0) { + pr_err("%s: Cannot read journal superblock\n", __func__); + brelse(bh); + return -EIO; + } + + journal->j_sb_buffer = bh; + sb = (journal_superblock_t *)bh->b_data; + journal->j_superblock = sb; + err = journal_check_superblock(journal); + if (err) { + journal_fail_superblock(journal); + return err; + } + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); + + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + + if (jbd2_has_feature_fast_commit(journal)) { + journal->j_fc_last = be32_to_cpu(sb->s_maxlen); + journal->j_last = journal->j_fc_last - + jbd2_journal_get_num_fc_blks(sb); + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } + + return 0; +} + + +/* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ @@ -1352,12 +1524,21 @@ static journal_t *journal_init_common(struct block_device *bdev, static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; - struct buffer_head *bh; int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - return NULL; + return ERR_PTR(-ENOMEM); + + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_total_len = len; + + err = journal_load_superblock(journal); + if (err) + goto err_cleanup; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_done_commit); @@ -1370,12 +1551,15 @@ static journal_t *journal_init_common(struct block_device *bdev, mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_history_lock); rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1385,18 +1569,11 @@ static journal_t *journal_init_common(struct block_device *bdev, if (err) goto err_cleanup; - spin_lock_init(&journal->j_history_lock); - - lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", - &jbd2_trans_commit_key, 0); - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_total_len = len; - /* We need enough buffers to write out full descriptor block. */ + /* + * journal descriptor can store up to n blocks, we need enough + * buffers to write out full descriptor block. + */ + err = -ENOMEM; n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; @@ -1405,37 +1582,32 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal->j_wbuf) goto err_cleanup; - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - pr_err("%s: Cannot get buffer for journal superblock\n", - __func__); + err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0, + GFP_KERNEL); + if (err) goto err_cleanup; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; journal->j_shrink_transaction = NULL; journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; journal->j_shrinker.count_objects = jbd2_journal_shrink_count; journal->j_shrinker.seeks = DEFAULT_SEEKS; journal->j_shrinker.batch = journal->j_max_transaction_buffers; - - if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) + err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (err) goto err_cleanup; - if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { - percpu_counter_destroy(&journal->j_checkpoint_jh_count); - goto err_cleanup; - } return journal; err_cleanup: - brelse(journal->j_sb_buffer); + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); + journal_fail_superblock(journal); kfree(journal); - return NULL; + return ERR_PTR(err); } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1468,8 +1640,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, journal_t *journal; journal = journal_init_common(bdev, fs_dev, start, len, blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg", journal->j_dev); @@ -1495,11 +1667,9 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) blocknr = 0; err = bmap(inode, &blocknr); - if (err || !blocknr) { - pr_err("%s: Cannot locate journal superblock\n", - __func__); - return NULL; + pr_err("%s: Cannot locate journal superblock\n", __func__); + return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", @@ -1509,8 +1679,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), @@ -1522,18 +1692,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) } /* - * If the journal init or create aborts, we need to mark the journal - * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. - */ -static void journal_fail_superblock(journal_t *journal) -{ - struct buffer_head *bh = journal->j_sb_buffer; - brelse(bh); - journal->j_sb_buffer = NULL; -} - -/* * Given a journal_t structure, initialise the various fields for * startup of a new journaling session. We use this both when creating * a journal, and after recovering an old journal to reset it for @@ -1889,163 +2047,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); -static int journal_revoke_records_per_block(journal_t *journal) -{ - int record_size; - int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); - - if (jbd2_has_feature_64bit(journal)) - record_size = 8; - else - record_size = 4; - - if (jbd2_journal_has_csum_v2or3(journal)) - space -= sizeof(struct jbd2_journal_block_tail); - return space / record_size; -} - -/* - * Read the superblock for a given journal, performing initial - * validation of the format. - */ -static int journal_get_superblock(journal_t *journal) -{ - struct buffer_head *bh; - journal_superblock_t *sb; - int err; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - if (buffer_verified(bh)) - return 0; - - err = bh_read(bh, 0); - if (err < 0) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } - - sb = journal->j_superblock; - - err = -EINVAL; - - if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || - sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); - goto out; - } - - if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && - be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { - printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); - goto out; - } - - if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { - printk(KERN_WARNING "JBD2: journal file too short\n"); - goto out; - } - - if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_total_len) { - printk(KERN_WARNING - "JBD2: Invalid start block of journal: %u\n", - be32_to_cpu(sb->s_first)); - goto out; - } - - if (jbd2_has_feature_csum2(journal) && - jbd2_has_feature_csum3(journal)) { - /* Can't have checksum v2 and v3 at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " - "at the same time!\n"); - goto out; - } - - if (jbd2_journal_has_csum_v2or3_feature(journal) && - jbd2_has_feature_checksum(journal)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " - "at the same time!\n"); - goto out; - } - - if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD2: Unknown checksum type\n"); - goto out; - } - - /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3_feature(journal)) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - err = PTR_ERR(journal->j_chksum_driver); - journal->j_chksum_driver = NULL; - goto out; - } - /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } - } - set_buffer_verified(bh); - return 0; - -out: - journal_fail_superblock(journal); - return err; -} - -/* - * Load the on-disk journal superblock and read the key fields into the - * journal_t. - */ - -static int load_superblock(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - int num_fc_blocks; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); - journal->j_tail = be32_to_cpu(sb->s_start); - journal->j_first = be32_to_cpu(sb->s_first); - journal->j_errno = be32_to_cpu(sb->s_errno); - journal->j_last = be32_to_cpu(sb->s_maxlen); - - if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) - journal->j_total_len = be32_to_cpu(sb->s_maxlen); - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); - - if (jbd2_has_feature_fast_commit(journal)) { - journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); - if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) - journal->j_last = journal->j_fc_last - num_fc_blocks; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } - - return 0; -} - - /** * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. @@ -2057,28 +2058,7 @@ static int load_superblock(journal_t *journal) int jbd2_journal_load(journal_t *journal) { int err; - journal_superblock_t *sb; - - err = load_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - /* - * If this is a V2 superblock, then we have to check the - * features flags on it. - */ - if (jbd2_format_support_feature(journal)) { - if ((sb->s_feature_ro_compat & - ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || - (sb->s_feature_incompat & - ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk(KERN_WARNING - "JBD2: Unrecognised features on journal\n"); - return -EINVAL; - } - } + journal_superblock_t *sb = journal->j_superblock; /* * Create a slab for this blocksize @@ -2089,8 +2069,11 @@ int jbd2_journal_load(journal_t *journal) /* Let the recovery code check whether it needs to recover any * data from the journal. */ - if (jbd2_journal_recover(journal)) - goto recovery_error; + err = jbd2_journal_recover(journal); + if (err) { + pr_warn("JBD2: journal recovery failed\n"); + return err; + } if (journal->j_failed_commit) { printk(KERN_ERR "JBD2: journal transaction %u on %s " @@ -2107,15 +2090,14 @@ int jbd2_journal_load(journal_t *journal) /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ - if (journal_reset(journal)) - goto recovery_error; + err = journal_reset(journal); + if (err) { + pr_warn("JBD2: journal reset failed\n"); + return err; + } journal->j_flags |= JBD2_LOADED; return 0; - -recovery_error: - printk(KERN_WARNING "JBD2: recovery failed\n"); - return -EIO; } /** @@ -2227,8 +2209,6 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; - if (journal_get_superblock(journal)) - return 0; if (!jbd2_format_support_feature(journal)) return 0; @@ -2518,16 +2498,12 @@ out: int jbd2_journal_wipe(journal_t *journal, int write) { - int err = 0; + int err; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); - err = load_superblock(journal); - if (err) - return err; - if (!journal->j_tail) - goto no_recovery; + return 0; printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); @@ -2540,7 +2516,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) mutex_unlock(&journal->j_checkpoint_mutex); } - no_recovery: return err; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0184931d47f7..c269a7d29a46 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -230,12 +230,8 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) /* Make sure we wrap around the log correctly! */ #define wrap(journal, var) \ do { \ - unsigned long _wrap_last = \ - jbd2_has_feature_fast_commit(journal) ? \ - (journal)->j_fc_last : (journal)->j_last; \ - \ - if (var >= _wrap_last) \ - var -= (_wrap_last - (journal)->j_first); \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ } while (0) static int fc_do_one_pass(journal_t *journal, @@ -524,9 +520,7 @@ static int do_one_pass(journal_t *journal, break; jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", - next_commit_ID, next_log_block, - jbd2_has_feature_fast_commit(journal) ? - journal->j_fc_last : journal->j_last); + next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4d1fda1f7143..5f08b5fd105a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh) /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ static void jbd2_freeze_jh_data(struct journal_head *jh) { - struct page *page; - int offset; char *source; struct buffer_head *bh = jh2bh(jh); J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); - page = bh->b_page; - offset = offset_in_page(bh->b_data); - source = kmap_atomic(page); + source = kmap_local_folio(bh->b_folio, bh_offset(bh)); /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); - memcpy(jh->b_frozen_data, source + offset, bh->b_size); - kunmap_atomic(source); + jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); + memcpy(jh->b_frozen_data, source, bh->b_size); + kunmap_local(source); /* * Now that the frozen data is saved off, we need to store any matching diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 5075a0a6d594..091ab0eaabbe 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -204,7 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, if (ret) goto fail; - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(ri->ctime))); jffs2_free_raw_inode(ri); @@ -237,7 +238,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) if (dead_f->inocache) set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); return ret; } /***********************************************************************/ @@ -271,7 +272,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, d_inode(old_dentry)); - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); ihold(d_inode(old_dentry)); } return ret; @@ -422,7 +423,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -566,7 +568,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(rd->mctime))); inc_nlink(dir_i); jffs2_free_raw_dirent(rd); @@ -607,7 +610,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, f, now); if (!ret) { - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } @@ -743,7 +746,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -864,14 +868,16 @@ static int jffs2_rename (struct mnt_idmap *idmap, * caller won't do it on its own since we are returning an error. */ d_invalidate(new_dentry); - new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, + ITIME(now)); return ret; } if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); - new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now)); + new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now)); return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 2345ca3f09ee..11c66793960e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -317,7 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; - inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_mtime = inode_set_ctime_to_ts(inode, + ITIME(je32_to_cpu(ri->ctime))); } } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 038516bee1ab..0403efab4089 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -115,7 +115,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); - ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime)); + ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode))); ri->offset = cpu_to_je32(0); ri->csize = ri->dsize = cpu_to_je32(mdatalen); @@ -148,7 +148,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) } /* It worked. Update the inode */ inode->i_atime = ITIME(je32_to_cpu(ri->atime)); - inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))); inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); inode->i_mode = jemode_to_cpu(ri->mode); i_uid_write(inode, je16_to_cpu(ri->uid)); @@ -284,7 +284,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode->i_size = je32_to_cpu(latest_node.isize); inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); - inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime))); set_nlink(inode, f->inocache->pino_nlink); @@ -388,7 +388,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) iattr.ia_gid = inode->i_gid; iattr.ia_atime = inode->i_atime; iattr.ia_mtime = inode->i_mtime; - iattr.ia_ctime = inode->i_ctime; + iattr.ia_ctime = inode_get_ctime(inode); jffs2_do_setattr(inode, &iattr); } @@ -475,7 +475,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_mode = jemode_to_cpu(ri->mode); i_gid_write(inode, je16_to_cpu(ri->gid)); i_uid_write(inode, je16_to_cpu(ri->uid)); - inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); inode->i_blocks = 0; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 8da19766c101..50727a1ff931 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -35,7 +35,7 @@ struct kvec; #define ITIME(sec) ((struct timespec64){sec, 0}) #define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds()) #define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec) -#define JFFS2_F_I_CTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_ctime) +#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f))) #define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime) #define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime) #define sleep_on_spinunlock(wq, s) \ diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 51e856f0e4b8..3728cf4d944d 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only config JFS_FS tristate "JFS filesystem support" + select BUFFER_HEAD select NLS + select NLS_UCS2_UTILS select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile index 7156d2c218c7..b769bbf8bdc2 100644 --- a/fs/jfs/Makefile +++ b/fs/jfs/Makefile @@ -9,7 +9,7 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \ jfs_extent.o symlink.o jfs_metapage.o \ - jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ + jfs_logmgr.o jfs_txnmgr.o \ resize.o xattr.o ioctl.o jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index fb96f872d207..1de3602c98de 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -116,7 +116,7 @@ int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (!rc) { if (update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } rc = txCommit(tid, 1, &inode, 0); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 8ac10e396050..920d58a1566b 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) break; } - ip->i_mtime = ip->i_ctime = current_time(ip); + ip->i_mtime = inode_set_ctime_current(ip); mark_inode_dirty(ip); txCommit(tid, 1, &ip, 0); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index ed7989bc2db1..f7bd7e8f5be4 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ int jfs_fileattr_set(struct mnt_idmap *idmap, jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index a14a0f18a4c4..88afd108c2dd 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -269,6 +269,7 @@ int dbUnmount(struct inode *ipbmap, int mounterror) /* free the memory for the in-memory bmap. */ kfree(bmp); + JFS_SBI(ipbmap->i_sb)->bmap = NULL; return (0); } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index ae99a7e232ee..63d21822d309 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -166,7 +166,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) /* * COMMIT_SyncList flags an anonymous tlock on page that is on * sync list. - * We need to commit the inode to get the page written disk. + * We need to commit the inode to get the page written to the disk. */ if (test_and_clear_cflag(COMMIT_Synclist,ip)) jfs_commit_inode(ip, 0); @@ -311,6 +311,11 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * blocks in the map. in that case, we'll start off with the * maximum free. */ + + /* give up if no space left */ + if (bmp->db_maxfreebud == -1) + return -ENOSPC; + max = (s64) 1 << bmp->db_maxfreebud; if (*nblocks >= max && *nblocks > nbperpage) nb = nblks = (max > nbperpage) ? max : nbperpage; diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 390cbfce391f..923a58422c46 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -193,6 +193,7 @@ int diUnmount(struct inode *ipimap, int mounterror) * free in-memory control structure */ kfree(imap); + JFS_IP(ipimap)->i_imap = NULL; return (0); } @@ -3064,8 +3065,8 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); - ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); - ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec), + le32_to_cpu(dip->di_ctime.tv_nsec)); ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); ip->i_generation = le32_to_cpu(dip->di_gen); @@ -3139,8 +3140,8 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); - dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); - dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); + dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec); + dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec); dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 9e1f02767201..87594efa7f7c 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - jfs_inode->otime = inode->i_ctime.tv_sec; + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + jfs_inode->otime = inode_get_ctime(inode).tv_sec; inode->i_generation = JFS_SBI(sb)->gengen++; jfs_inode->cflag = 0; diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h index 9db62d047daa..b6a78d4aef1b 100644 --- a/fs/jfs/jfs_unicode.h +++ b/fs/jfs/jfs_unicode.h @@ -8,16 +8,9 @@ #include <linux/slab.h> #include <asm/byteorder.h> +#include "../nls/nls_ucs2_data.h" #include "jfs_types.h" -typedef struct { - wchar_t start; - wchar_t end; - signed char *table; -} UNICASERANGE; - -extern signed char UniUpperTable[512]; -extern UNICASERANGE UniUpperRange[]; extern int get_UCSname(struct component_name *, struct dentry *); extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); @@ -107,12 +100,12 @@ static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, */ static inline wchar_t UniToupper(wchar_t uc) { - UNICASERANGE *rp; + const struct UniCaseRange *rp; - if (uc < sizeof(UniUpperTable)) { /* Latin characters */ - return uc + UniUpperTable[uc]; /* Use base tables */ + if (uc < sizeof(NlsUniUpperTable)) { /* Latin characters */ + return uc + NlsUniUpperTable[uc]; /* Use base tables */ } else { - rp = UniUpperRange; /* Use range tables */ + rp = NlsUniUpperRange; /* Use range tables */ while (rp->start) { if (uc < rp->start) /* Before start of range */ return uc; /* Uppercase = input */ diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c deleted file mode 100644 index d0b18c7befb8..000000000000 --- a/fs/jfs/jfs_uniupr.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) International Business Machines Corp., 2000-2002 - */ - -#include <linux/fs.h> -#include "jfs_unicode.h" - -/* - * Latin upper case - */ -signed char UniUpperTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ - 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ - -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ - 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ - 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ - 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ - -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ - 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ -}; - -/* Upper case range - Greek */ -static signed char UniCaseRangeU03a0[47] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ - 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ - -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, -}; - -/* Upper case range - Cyrillic */ -static signed char UniCaseRangeU0430[48] = { - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ - 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ -}; - -/* Upper case range - Extended cyrillic */ -static signed char UniCaseRangeU0490[61] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ - 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, -}; - -/* Upper case range - Extended latin and greek */ -static signed char UniCaseRangeU1e00[509] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ - 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ - 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ - 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ - 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ - 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ - 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Upper case range - Wide latin */ -static signed char UniCaseRangeUff40[27] = { - 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, -}; - -/* - * Upper Case Range - */ -UNICASERANGE UniUpperRange[] = { - { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, - { 0x0430, 0x045f, UniCaseRangeU0430 }, - { 0x0490, 0x04cc, UniCaseRangeU0490 }, - { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, - { 0xff40, 0xff5a, UniCaseRangeUff40 }, - { 0 } -}; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e98ddb2b1cf2..57d7a4300210 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); mark_inode_dirty(dip); @@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, /* update parent directory inode */ inc_nlink(dip); /* for '..' from child directory */ - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); @@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) /* update parent directory's link count corresponding * to ".." entry of the target directory deleted */ - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); inode_dec_link_count(dip); /* @@ -512,7 +512,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) ASSERT(ip->i_nlink); - ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip); + dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)); mark_inode_dirty(dip); /* update target's inode */ @@ -827,8 +827,8 @@ static int jfs_link(struct dentry *old_dentry, /* update object inode */ inc_nlink(ip); /* for new link */ - ip->i_ctime = current_time(ip); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_ctime_current(ip); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); ihold(ip); @@ -883,7 +883,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct component_name dname; u32 ssize; /* source pathname size */ struct btstack btstack; - struct inode *ip = d_inode(dentry); + struct inode *ip; s64 xlen = 0; int bmask = 0, xsize; s64 xaddr; @@ -1028,7 +1028,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); mark_inode_dirty(dip); /* * commit update of parent directory and link object @@ -1205,7 +1205,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, tblk->xflag |= COMMIT_DELETE; tblk->u.ip = new_ip; } else { - new_ip->i_ctime = current_time(new_ip); + inode_set_ctime_current(new_ip); mark_inode_dirty(new_ip); } } else { @@ -1268,10 +1268,10 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, /* * Update ctime on changed/moved inodes & mark dirty */ - old_ip->i_ctime = current_time(old_ip); + inode_set_ctime_current(old_ip); mark_inode_dirty(old_ip); - new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + new_dir->i_mtime = inode_set_ctime_current(new_dir); mark_inode_dirty(new_dir); /* Build list of inodes modified by this transaction */ @@ -1283,7 +1283,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (old_dir != new_dir) { iplist[ipcount++] = new_dir; - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); mark_inode_dirty(old_dir); } @@ -1416,7 +1416,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(ip); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d2f82cb7db1b..2e2f7f6d36a0 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -818,7 +818,7 @@ out: } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 931e50018f88..8577ad494e05 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -647,7 +647,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, if (old_blocks) dquot_free_block(inode, old_blocks); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); return 0; } diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 5d826274570c..c429c42a6867 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -8,16 +8,16 @@ /** * kernel_read_file() - read file contents into a kernel buffer * - * @file file to read from - * @offset where to start reading from (see below). - * @buf pointer to a "void *" buffer for reading into (if + * @file: file to read from + * @offset: where to start reading from (see below). + * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) - * @buf_size size of buf, if already allocated. If @buf not + * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. - * @file_size if non-NULL, the full size of @file will be + * @file_size: if non-NULL, the full size of @file will be * written here. - * @id the kernel_read_file_id identifying the type of + * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5a1a4af9d3d2..8b2bd65d70e7 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -383,9 +383,11 @@ static int kernfs_link_sibling(struct kernfs_node *kn) rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } @@ -408,9 +410,11 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) if (RB_EMPTY_NODE(&kn->rb)) return false; + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; kernfs_inc_rev(kn->parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); @@ -556,7 +560,7 @@ void kernfs_put(struct kernfs_node *kn) kfree_const(kn->name); if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs); + simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b22b74d1a115..922719a343a7 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -151,8 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } static inline void set_inode_attr(struct inode *inode, @@ -162,7 +161,7 @@ static inline void set_inode_attr(struct inode *inode, inode->i_gid = attrs->ia_gid; inode->i_atime = attrs->ia_atime; inode->i_mtime = attrs->ia_mtime; - inode->i_ctime = attrs->ia_ctime; + inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) @@ -191,7 +190,7 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap, down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; @@ -306,11 +305,17 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name, int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { + struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); + old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -342,7 +347,7 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; + struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { @@ -355,13 +360,18 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, goto dec_size_out; } - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); - - if (!ret && removed_size >= 0) - size = removed_size; - else if (!ret) + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) return 0; + + if (IS_ERR(old_xattr)) { + ret = PTR_ERR(old_xattr); + goto dec_size_out; + } + + ret = 0; + size = old_xattr->size; + simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: @@ -376,18 +386,19 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; - int ret; + struct simple_xattr *old_xattr; - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) + return 0; - if (removed_size >= 0) { - atomic_sub(removed_size, sz); - atomic_dec(nr); - } + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); - return ret; + atomic_sub(old_xattr->size, sz); + atomic_dec(nr); + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d49606accb07..c4bf26142eec 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,6 +16,8 @@ #include <linux/namei.h> #include <linux/seq_file.h> #include <linux/exportfs.h> +#include <linux/uuid.h> +#include <linux/statfs.h> #include "kernfs-internal.h" @@ -45,8 +47,15 @@ static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) return 0; } +static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + simple_statfs(dentry, buf); + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; +} + const struct super_operations kernfs_sops = { - .statfs = simple_statfs, + .statfs = kernfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, @@ -351,6 +360,8 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; + uuid_gen(&sb->s_uuid); + down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_supers_rwsem); diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..37f2d34ee090 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -33,7 +33,7 @@ int simple_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -239,6 +239,254 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static void offset_set(struct dentry *dentry, u32 offset) +{ + dentry->d_fsdata = (void *)((uintptr_t)(offset)); +} + +static u32 dentry2offset(struct dentry *dentry) +{ + return (u32)((uintptr_t)(dentry->d_fsdata)); +} + +static struct lock_class_key simple_offset_xa_lock; + +/** + * simple_offset_init - initialize an offset_ctx + * @octx: directory offset map to be initialized + * + */ +void simple_offset_init(struct offset_ctx *octx) +{ + xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); + lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); + + /* 0 is '.', 1 is '..', so always start with offset 2 */ + octx->next_offset = 2; +} + +/** + * simple_offset_add - Add an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: new dentry being added + * + * Returns zero on success. @so_ctx and the dentry offset are updated. + * Otherwise, a negative errno value is returned. + */ +int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) +{ + static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + u32 offset; + int ret; + + if (dentry2offset(dentry) != 0) + return -EBUSY; + + ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, + &octx->next_offset, GFP_KERNEL); + if (ret < 0) + return ret; + + offset_set(dentry, offset); + return 0; +} + +/** + * simple_offset_remove - Remove an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: dentry being removed + * + */ +void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) +{ + u32 offset; + + offset = dentry2offset(dentry); + if (offset == 0) + return; + + xa_erase(&octx->xa, offset); + offset_set(dentry, 0); +} + +/** + * simple_offset_rename_exchange - exchange rename with directory offsets + * @old_dir: parent of dentry being moved + * @old_dentry: dentry being moved + * @new_dir: destination parent + * @new_dentry: destination dentry + * + * Returns zero on success. Otherwise a negative errno is returned and the + * rename is rolled back. + */ +int simple_offset_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); + struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); + u32 old_index = dentry2offset(old_dentry); + u32 new_index = dentry2offset(new_dentry); + int ret; + + simple_offset_remove(old_ctx, old_dentry); + simple_offset_remove(new_ctx, new_dentry); + + ret = simple_offset_add(new_ctx, old_dentry); + if (ret) + goto out_restore; + + ret = simple_offset_add(old_ctx, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + goto out_restore; + } + + ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + simple_offset_remove(old_ctx, new_dentry); + goto out_restore; + } + return 0; + +out_restore: + offset_set(old_dentry, old_index); + xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + offset_set(new_dentry, new_index); + xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + return ret; +} + +/** + * simple_offset_destroy - Release offset map + * @octx: directory offset ctx that is about to be destroyed + * + * During fs teardown (eg. umount), a directory's offset map might still + * contain entries. xa_destroy() cleans out anything that remains. + */ +void simple_offset_destroy(struct offset_ctx *octx) +{ + xa_destroy(&octx->xa); +} + +/** + * offset_dir_llseek - Advance the read position of a directory descriptor + * @file: an open directory whose position is to be updated + * @offset: a byte offset + * @whence: enumerator describing the starting position for this update + * + * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. + * + * Returns the updated read position if successful; otherwise a + * negative errno is returned and the read position remains unchanged. + */ +static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + fallthrough; + case SEEK_SET: + if (offset >= 0) + break; + fallthrough; + default: + return -EINVAL; + } + + return vfs_setpos(file, offset, U32_MAX); +} + +static struct dentry *offset_find_next(struct xa_state *xas) +{ + struct dentry *child, *found = NULL; + + rcu_read_lock(); + child = xas_next_entry(xas, U32_MAX); + if (!child) + goto out; + spin_lock(&child->d_lock); + if (simple_positive(child)) + found = dget_dlock(child); + spin_unlock(&child->d_lock); +out: + rcu_read_unlock(); + return found; +} + +static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) +{ + u32 offset = dentry2offset(dentry); + struct inode *inode = d_inode(dentry); + + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); +} + +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +{ + struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); + XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct dentry *dentry; + + while (true) { + dentry = offset_find_next(&xas); + if (!dentry) + break; + + if (!offset_dir_emit(ctx, dentry)) { + dput(dentry); + break; + } + + dput(dentry); + ctx->pos = xas.xa_index + 1; + } +} + +/** + * offset_readdir - Emit entries starting at offset @ctx->pos + * @file: an open directory to iterate over + * @ctx: directory iteration context + * + * Caller must hold @file's i_rwsem to prevent insertion or removal of + * entries during this call. + * + * On entry, @ctx->pos contains an offset that represents the first entry + * to be read from the directory. + * + * The operation continues until there are no more entries to read, or + * until the ctx->actor indicates there is no more space in the caller's + * output buffer. + * + * On return, @ctx->pos contains an offset that will read the next entry + * in this directory when offset_readdir() is called again with @ctx. + * + * Return values: + * %0 - Complete + */ +static int offset_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dir = file->f_path.dentry; + + lockdep_assert_held(&d_inode(dir)->i_rwsem); + + if (!dir_emit_dots(file, ctx)) + return 0; + + offset_iterate_dir(d_inode(dir), ctx); + return 0; +} + +const struct file_operations simple_offset_dir_operations = { + .llseek = offset_dir_llseek, + .iterate_shared = offset_readdir, + .read = generic_read_dir, + .fsync = noop_fsync, +}; + static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL; @@ -275,7 +523,7 @@ void simple_recursive_removal(struct dentry *dentry, while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; @@ -293,8 +541,7 @@ void simple_recursive_removal(struct dentry *dentry, dput(victim); // unpin it } if (victim == dentry) { - inode->i_ctime = inode->i_mtime = - current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); @@ -335,7 +582,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = root->i_ctime = current_time(root); + root->i_atime = root->i_mtime = inode_set_ctime_current(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; @@ -391,7 +638,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); inc_nlink(inode); ihold(inode); dget(dentry); @@ -425,7 +673,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); drop_nlink(inode); dput(dentry); return 0; @@ -444,6 +693,31 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL(simple_rmdir); +/** + * simple_rename_timestamp - update the various inode timestamps for rename + * @old_dir: old parent directory + * @old_dentry: dentry that is being renamed + * @new_dir: new parent directory + * @new_dentry: target for rename + * + * POSIX mandates that the old and new parent directories have their ctime and + * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have + * their ctime updated. + */ +void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *newino = d_inode(new_dentry); + + old_dir->i_mtime = inode_set_ctime_current(old_dir); + if (new_dir != old_dir) + new_dir->i_mtime = inode_set_ctime_current(new_dir); + inode_set_ctime_current(d_inode(old_dentry)); + if (newino) + inode_set_ctime_current(newino); +} +EXPORT_SYMBOL_GPL(simple_rename_timestamp); + int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -459,11 +733,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(old_dir); } } - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - d_inode(old_dentry)->i_ctime = - d_inode(new_dentry)->i_ctime = current_time(old_dir); - + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); @@ -472,7 +742,6 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) @@ -495,9 +764,7 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, inc_nlink(new_dir); } - old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = - new_dir->i_mtime = inode->i_ctime = current_time(old_dir); - + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); @@ -548,21 +815,20 @@ int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { - struct page *page; - pgoff_t index; - - index = pos >> PAGE_SHIFT; + struct folio *folio; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - *pagep = page; + *pagep = &folio->page; - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - unsigned from = pos & (PAGE_SIZE - 1); + if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { + size_t from = offset_in_folio(folio, pos); - zero_user_segments(page, 0, from, from + len, PAGE_SIZE); + folio_zero_segments(folio, 0, from, + from + len, folio_size(folio)); } return 0; } @@ -594,17 +860,18 @@ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size @@ -613,9 +880,9 @@ static int simple_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -659,7 +926,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); @@ -685,7 +952,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, goto out; } inode->i_mode = S_IFREG | files->mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); @@ -1253,7 +1520,7 @@ struct inode *alloc_anon_inode(struct super_block *s) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); @@ -1269,7 +1536,7 @@ EXPORT_SYMBOL(alloc_anon_inode); * All arguments are ignored and it just returns -EINVAL. */ int -simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, +simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; @@ -1315,7 +1582,7 @@ static int empty_dir_getattr(struct mnt_idmap *idmap, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -1381,16 +1648,6 @@ bool is_empty_dir_inode(struct inode *inode) } #if IS_ENABLED(CONFIG_UNICODE) -/* - * Determine if the name of a dentry should be casefolded. - * - * Return: if names will need casefolding - */ -static bool needs_casefold(const struct inode *dir) -{ - return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; -} - /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against @@ -1411,7 +1668,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, char strbuf[DNAME_INLINE_LEN]; int ret; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) goto fallback; /* * If the dentry name is stored in-line, then it may be concurrently @@ -1453,7 +1710,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct unicode_map *um = sb->s_encoding; int ret = 0; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); @@ -1646,6 +1903,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ + iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1d9488cf0534..87a0f207df0b 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -276,6 +276,9 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, { struct nsm_handle *new; + if (!hostname) + return NULL; + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); if (unlikely(new == NULL)) return NULL; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 22d3ff3818f5..6579948070a4 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -45,7 +45,6 @@ #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) -#define ALLOWED_SIGS (sigmask(SIGKILL)) static struct svc_program nlmsvc_program; @@ -57,6 +56,12 @@ static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; +static void nlmsvc_request_retry(struct timer_list *tl) +{ + svc_wake_up(nlmsvc_serv); +} +DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); + unsigned int lockd_net_id; /* @@ -111,26 +116,12 @@ static void set_grace_period(struct net *net) schedule_delayed_work(&ln->grace_period_end, grace_period); } -static void restart_grace(void) -{ - if (nlmsvc_ops) { - struct net *net = &init_net; - struct lockd_net *ln = net_generic(net, lockd_net_id); - - cancel_delayed_work_sync(&ln->grace_period_end); - locks_end_grace(&ln->lockd_manager); - nlmsvc_invalidate_all(); - set_grace_period(net); - } -} - /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { - int err = 0; struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -138,9 +129,6 @@ lockd(void *vrqstp) /* try_to_freeze() is called from svc_recv() */ set_freezable(); - /* Allow SIGKILL to tell lockd to drop all of its locks */ - allow_signal(SIGKILL); - dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* @@ -148,33 +136,12 @@ lockd(void *vrqstp) * NFS mount or NFS daemon has gone away. */ while (!kthread_should_stop()) { - long timeout = MAX_SCHEDULE_TIMEOUT; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; - if (signalled()) { - flush_signals(current); - restart_grace(); - continue; - } - - timeout = nlmsvc_retry_blocked(); - - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - err = svc_recv(rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) - continue; - dprintk("lockd: request from %s\n", - svc_print_addr(rqstp, buf, sizeof(buf))); - - svc_process(rqstp); + nlmsvc_retry_blocked(); + svc_recv(rqstp); } - flush_signals(current); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -407,6 +374,7 @@ static void lockd_put(void) #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); + timer_delete_sync(&nlmsvc_retry); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); } @@ -538,7 +506,7 @@ static inline int is_callback(u32 proc) } -static int lockd_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c43ccdf28ed9..43aeba9de55c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -131,12 +131,14 @@ static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) static inline void nlmsvc_remove_block(struct nlm_block *block) { + spin_lock(&nlm_blocked_lock); if (!list_empty(&block->b_list)) { - spin_lock(&nlm_blocked_lock); list_del_init(&block->b_list); spin_unlock(&nlm_blocked_lock); nlmsvc_release_block(block); + return; } + spin_unlock(&nlm_blocked_lock); } /* @@ -152,6 +154,7 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) file, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, lock->fl.fl_type); + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", @@ -161,9 +164,11 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) nlmdbg_cookie2a(&block->b_call->a_args.cookie)); if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); return block; } } + spin_unlock(&nlm_blocked_lock); return NULL; } @@ -185,16 +190,19 @@ nlmsvc_find_block(struct nlm_cookie *cookie) { struct nlm_block *block; + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) goto found; } + spin_unlock(&nlm_blocked_lock); return NULL; found: dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); return block; } @@ -317,6 +325,7 @@ void nlmsvc_traverse_blocks(struct nlm_host *host, restart: mutex_lock(&file->f_mutex); + spin_lock(&nlm_blocked_lock); list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { if (!match(block->b_host, host)) continue; @@ -325,11 +334,13 @@ restart: if (list_empty(&block->b_list)) continue; kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); mutex_unlock(&file->f_mutex); nlmsvc_unlink_block(block); nlmsvc_release_block(block); goto restart; } + spin_unlock(&nlm_blocked_lock); mutex_unlock(&file->f_mutex); } @@ -1008,7 +1019,7 @@ retry_deferred_block(struct nlm_block *block) * picks up locks that can be granted, or grant notifications that must * be retransmitted. */ -unsigned long +void nlmsvc_retry_blocked(void) { unsigned long timeout = MAX_SCHEDULE_TIMEOUT; @@ -1038,5 +1049,6 @@ nlmsvc_retry_blocked(void) } spin_unlock(&nlm_blocked_lock); - return timeout; + if (timeout < MAX_SCHEDULE_TIMEOUT) + mod_timer(&nlmsvc_retry, jiffies + timeout); } diff --git a/fs/locks.c b/fs/locks.c index df8b26a42524..76ad05f8070a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -438,7 +438,7 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) fl->fl_end = OFFSET_MAX; } -static int assign_type(struct file_lock *fl, long type) +static int assign_type(struct file_lock *fl, int type) { switch (type) { case F_RDLCK: @@ -549,7 +549,7 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, long type, struct file_lock *fl) +static int lease_init(struct file *filp, int type, struct file_lock *fl) { if (assign_type(fl, type) != 0) return -EINVAL; @@ -567,7 +567,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, long type) +static struct file_lock *lease_alloc(struct file *filp, int type) { struct file_lock *fl = locks_alloc_lock(); int error = -ENOMEM; @@ -868,6 +868,21 @@ static bool posix_locks_conflict(struct file_lock *caller_fl, return locks_conflict(caller_fl, sys_fl); } +/* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK + * path so checks for additional GETLK-specific things like F_UNLCK. + */ +static bool posix_test_locks_conflict(struct file_lock *caller_fl, + struct file_lock *sys_fl) +{ + /* F_UNLCK checks any locks on the same fd. */ + if (caller_fl->fl_type == F_UNLCK) { + if (!posix_same_owner(caller_fl, sys_fl)) + return false; + return locks_overlap(caller_fl, sys_fl); + } + return posix_locks_conflict(caller_fl, sys_fl); +} + /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). */ @@ -901,7 +916,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (!posix_locks_conflict(fl, cfl)) + if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { @@ -1301,6 +1316,7 @@ retry: out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); + trace_posix_lock_inode(inode, request, error); /* * Free any unused locks. */ @@ -1309,7 +1325,6 @@ retry: if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); - trace_posix_lock_inode(inode, request, error); return error; } @@ -1666,7 +1681,7 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(struct file *filp, const long arg, int flags) +check_conflicting_open(struct file *filp, const int arg, int flags) { struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; @@ -1701,7 +1716,7 @@ check_conflicting_open(struct file *filp, const long arg, int flags) } static int -generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) +generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); @@ -1729,13 +1744,6 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (is_deleg && !inode_trylock(inode)) return -EAGAIN; - if (is_deleg && arg == F_WRLCK) { - /* Write delegations are not currently supported: */ - inode_unlock(inode); - WARN_ON_ONCE(1); - return -EINVAL; - } - percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1859,7 +1867,7 @@ static int generic_delete_lease(struct file *filp, void *owner) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp, +int generic_setlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct inode *inode = file_inode(filp); @@ -1906,7 +1914,7 @@ lease_notifier_chain_init(void) } static inline void -setlease_notifier(long arg, struct file_lock *lease) +setlease_notifier(int arg, struct file_lock *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); @@ -1942,7 +1950,7 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier); * may be NULL if the lm_setup operation doesn't require it. */ int -vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) +vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) { if (lease) setlease_notifier(arg, *lease); @@ -1953,7 +1961,7 @@ vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { struct file_lock *fl; struct fasync_struct *new; @@ -1988,7 +1996,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ -int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); @@ -2136,7 +2144,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); * @fl: The file_lock who's fl_pid should be translated * @ns: The namespace into which the pid should be translated * - * Used to tranlate a fl_pid into a namespace virtual pid number + * Used to translate a fl_pid into a namespace virtual pid number */ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) { @@ -2207,7 +2215,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (fl == NULL) return -ENOMEM; error = -EINVAL; - if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) + if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK + && flock->l_type != F_WRLCK) goto out; error = flock_to_posix_lock(filp, fl, flock); @@ -2414,7 +2423,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) return -ENOMEM; error = -EINVAL; - if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) + if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK + && flock->l_type != F_WRLCK) goto out; error = flock64_to_posix_lock(filp, fl, flock); diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index de2003974ff0..90ddfad2a75e 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -2,6 +2,7 @@ config MINIX_FS tristate "Minix file system support" depends on BLOCK + select BUFFER_HEAD help Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 870207ba23f1..25c08fbfcb9d 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode) } inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index bf9858f76b6a..20f23e6e58ad 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -281,7 +281,7 @@ got_it: de->inode = inode->i_ino; } dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: @@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) else de->inode = 0; dir_commit_chunk(page, pos, len); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return minix_handle_dirsync(inode); } @@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, else de->inode = inode->i_ino; dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e9fbb5303a22..df575473c1cc 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -501,10 +501,7 @@ static struct inode *V1_minix_iget(struct inode *inode) i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0); inode->i_blocks = 0; for (i = 0; i < 9; i++) minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; @@ -543,10 +540,9 @@ static struct inode *V2_minix_iget(struct inode *inode) inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; inode->i_atime.tv_sec = raw_inode->i_atime; - inode->i_ctime.tv_sec = raw_inode->i_ctime; + inode_set_ctime(inode, raw_inode->i_ctime, 0); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_blocks = 0; for (i = 0; i < 10; i++) minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; @@ -622,7 +618,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) raw_inode->i_size = inode->i_size; raw_inode->i_mtime = inode->i_mtime.tv_sec; raw_inode->i_atime = inode->i_atime.tv_sec; - raw_inode->i_ctime = inode->i_ctime.tv_sec; + raw_inode->i_ctime = inode_get_ctime(inode).tv_sec; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); else for (i = 0; i < 10; i++) @@ -660,7 +656,7 @@ int minix_getattr(struct mnt_idmap *idmap, const struct path *path, struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 446148792f41..ce18ae37c29d 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -131,7 +131,7 @@ static inline int splice_branch(struct inode *inode, /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -350,7 +350,7 @@ do_indirects: } first_whole++; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 956d5183828d..114084d5636a 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -98,7 +98,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); @@ -154,7 +154,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) if (err) return err; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); return 0; } @@ -218,7 +218,7 @@ static int minix_rename(struct mnt_idmap *idmap, put_page(new_page); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/namei.c b/fs/namei.c index e56ff39a79bc..567ee547492b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -643,6 +643,8 @@ static bool nd_alloc_stack(struct nameidata *nd) /** * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. @@ -1083,6 +1085,7 @@ fs_initcall(init_fs_namei_sysctls); /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data + * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is @@ -2890,7 +2893,7 @@ int path_pts(struct path *path) dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); - if (!child) + if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 3404707ddbe7..2cd3ccf4c439 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) xas_for_each(&xas, folio, last_page) { loff_t pg_end; bool pg_failed = false; + bool folio_started; if (xas_retry(&xas, folio)) continue; pg_end = folio_pos(folio) + folio_size(folio) - 1; + folio_started = false; for (;;) { loff_t sreq_end; @@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { folio_start_fscache(folio); + folio_started = true; + } pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b6fc169be1b1..7df2503cef6c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -209,8 +209,6 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable use of the NFS v4.2 READ_PLUS operation. diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 70f5563a8e81..65cbb5607a5f 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -404,7 +404,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->concat.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -433,7 +433,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->stripe.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 456af7d230cf..466ebf1d41b2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -74,23 +74,12 @@ out_err: static int nfs4_callback_svc(void *vrqstp) { - int err; struct svc_rqst *rqstp = vrqstp; set_freezable(); - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - /* - * Listen for a request on the socket - */ - err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) - continue; - svc_process(rqstp); - } + while (!kthread_freezable_should_stop(NULL)) + svc_recv(rqstp); svc_exit_thread(rqstp); return 0; @@ -112,11 +101,7 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -387,7 +372,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound */ -static int nfs_callback_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp) { rqstp->rq_auth_stat = rpc_autherr_badcred; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c1eda73254e1..6bed1394d748 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -59,7 +59,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; + res->ctime = inode_get_ctime(inode); res->mtime = inode->i_mtime; res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e4c5f193ed5e..44eca51b2808 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -517,6 +517,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = cl_init->xprtsec, + .connect_timeout = cl_init->connect_timeout, + .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8f3112e71a6a..e6a51fd94fea 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1089,6 +1089,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; + /* + * nfs_readdir_handle_cache_misses return force clear at + * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for + * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 + * entries need be emitted here. + */ + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { + desc->eob = true; + break; + } + ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { @@ -1107,10 +1118,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { - desc->eob = true; - break; - } } if (array->folio_is_eof) desc->eof = !desc->eob; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9a18c5a69ace..f6c74f424691 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, dreq->max_count = dreq_len; if (dreq->count > dreq_len) dreq->count = dreq_len; - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - else /* Clear outstanding error if this is EOF */ - dreq->error = 0; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } static void @@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) +{ + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); + + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -472,21 +482,47 @@ out: return result; } -static void -nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_add_page_head(struct list_head *list, + struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + if (!list_empty(&head->wb_list) || !nfs_lock_request(head)) + return; + if (!list_empty(&head->wb_list)) { + nfs_unlock_request(head); + return; + } + list_add(&head->wb_list, list); + kref_get(&head->wb_kref); + kref_get(&head->wb_kref); +} + +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { - struct nfs_page *req, *next; + struct nfs_page *req, *subreq; list_for_each_entry(req, list, wb_list) { - if (req->wb_head != req || req->wb_this_page == req) + if (req->wb_head != req) { + nfs_direct_add_page_head(&req->wb_list, req); continue; - for (next = req->wb_this_page; - next != req->wb_head; - next = next->wb_this_page) { - nfs_list_remove_request(next); - nfs_release_request(next); } - nfs_join_page_group(req, inode); + subreq = req->wb_this_page; + if (subreq == req) + continue; + do { + /* + * Remove subrequests from this list before freeing + * them in the call to nfs_join_page_group(). + */ + if (!list_empty(&subreq->wb_list)) { + nfs_list_remove_request(subreq); + nfs_release_request(subreq); + } + } while ((subreq = subreq->wb_this_page) != req); + nfs_join_page_group(req, cinfo, inode); } } @@ -504,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); - dreq->count = 0; - dreq->max_count = 0; - list_for_each_entry(req, &reqs, wb_list) - dreq->max_count += req->wb_bytes; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -525,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -565,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; - dreq->max_count = 0; - dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; } else { status = dreq->error; @@ -577,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (status >= 0 && !nfs_write_match_verf(verf, req)) { + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, @@ -585,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) */ req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else /* Error or match */ + } else nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -638,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) while (!list_empty(&reqs)) { req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -687,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; @@ -731,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.offset + hdr->args.count - - hdr->io_start; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -770,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -813,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { - result = desc.pg_error; - nfs_unlock_and_release_request(req); - break; - } pgbase = 0; bytes -= req_len; requested_bytes += req_len; pos += req_len; dreq->bytes_left -= req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + + nfs_lock_request(req); + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 6603b5cee029..714975e5c0db 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -7,14 +7,16 @@ * Resolves DNS hostnames into valid ip addresses */ -#ifdef CONFIG_NFS_USE_KERNEL_DNS - #include <linux/module.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> -#include <linux/dns_resolver.h> + #include "dns_resolve.h" +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include <linux/dns_resolver.h> + ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, struct sockaddr_storage *ss, size_t salen) { @@ -35,7 +37,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #else -#include <linux/module.h> #include <linux/hash.h> #include <linux/string.h> #include <linux/kmod.h> @@ -43,15 +44,12 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #include <linux/socket.h> #include <linux/seq_file.h> #include <linux/inet.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/addr.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" -#include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 79b1b3fcd3fc..3f9768810427 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -200,7 +200,7 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe EXPORT_SYMBOL_GPL(nfs_file_splice_read); int -nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +nfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); int status; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7deb3cd76abe..a1dc33864906 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 8c35d88a84b1..b05717fe0d4e 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index e1706e736c64..2dc64454492b 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -116,8 +116,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * memset(auxdata, 0, sizeof(*auxdata)); auxdata->mtime_sec = inode->i_mtime.tv_sec; auxdata->mtime_nsec = inode->i_mtime.tv_nsec; - auxdata->ctime_sec = inode->i_ctime.tv_sec; - auxdata->ctime_nsec = inode->i_ctime.tv_nsec; + auxdata->ctime_sec = inode_get_ctime(inode).tv_sec; + auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec; if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) auxdata->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8172dd4135a1..e21c073158e5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -514,7 +514,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode_set_ctime(inode, 0, 0); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); @@ -535,7 +535,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -731,7 +731,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -749,7 +749,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -765,7 +765,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -912,7 +912,7 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; @@ -1444,11 +1444,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); } ts = inode->i_mtime; @@ -1510,7 +1510,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; @@ -1997,7 +1997,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - fattr->pre_ctime = inode->i_ctime; + fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && @@ -2190,7 +2190,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity & NFS_INO_INVALID_MTIME; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 913c09806c7f..9c9cf764f600 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -82,6 +82,8 @@ struct nfs_client_initdata { const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; /* @@ -493,6 +495,7 @@ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); +extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 19d51ebf842c..e7494cdd957e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -215,7 +215,8 @@ nfs_namespace_getattr(struct mnt_idmap *idmap, if (NFS_FH(d_inode(path->dentry))->size != 0) return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); return 0; } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 05c3b4b2b3dd..c19093814296 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -949,7 +949,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index eff3802c5e03..674c012868b1 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -86,6 +86,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; + unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10; struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, @@ -98,6 +99,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .timeparms = &ds_timeout, .cred = mds_srv->cred, .xprtsec = mds_clp->cl_xprtsec, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3b0b650c9c5a..60f032be805a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1991,7 +1991,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0fe5aacbcfdf..b59876b01a1e 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ * more? Need to consider not to pre-alloc too much for a compound. */ #define PNFS_LAYOUTSTATS_MAXDEV (4) +#define READ_PLUS_SCRATCH_SIZE (16) /* nfs4.2proc.c */ #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 63802d195556..063e00aff87e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -471,8 +471,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, continue; } break; - } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) { - args.sync = true; + } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && + args.sync != res.synchronous) { + args.sync = res.synchronous; dst_exception.retry = 1; continue; } else if ((err == -ESTALE || @@ -1377,7 +1378,6 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, for (i = 0; i < np; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) { - np = i + 1; err = -ENOMEM; goto out; } @@ -1401,8 +1401,8 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, } while (exception.retry); out: - while (--np >= 0) - __free_page(pages[np]); + while (--i >= 0) + __free_page(pages[i]); kfree(pages); return err; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 95234208dc9e..9e3ae53e2205 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -54,10 +54,16 @@ (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ 1 /* data_info4.di_length */) +#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 2 /* data_info4.di_length */) +#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \ + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - NFS42_READ_PLUS_DATA_SEGMENT_SIZE) + NFS42_READ_PLUS_HOLE_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -617,8 +623,8 @@ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, encode_putfh(xdr, args->fh, &hdr); encode_read_plus(xdr, args, &hdr); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF); encode_nops(&hdr); } @@ -1056,13 +1062,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) res->eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) - return status; + return 0; segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); if (!segs) return -ENOMEM; - status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); if (status < 0) @@ -1428,7 +1433,7 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch)); + xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4c9f8bd866ab..47c5c1f86d66 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -328,7 +328,7 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); -extern int nfs4_proc_setlease(struct file *file, long arg, +extern int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d9114a754db7..11e3a285594c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -232,6 +232,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + if (test_bit(NFS_CS_DS, &cl_init->init_flags)) + __set_bit(NFS_CS_DS, &clp->cl_flags); /* * Set up the connection to the server before we add add to the * global list. @@ -415,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) .net = old->cl_net, .servername = old->cl_hostname, }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; if (clp->cl_proto != old->cl_proto) return; @@ -428,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) xprt_args.addrlen = clp_salen; rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, &max_connect); } /** @@ -1007,6 +1011,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_DS, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4aeadd6e1a6d..02788c3c85e5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -438,7 +438,7 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { return nfs4_proc_setlease(file, arg, lease, priv); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e1a886b58354..3508d8238826 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } @@ -5438,18 +5442,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, return false; } -static inline void nfs4_read_plus_scratch_free(struct nfs_pgio_header *hdr) -{ - if (hdr->res.scratch) { - kfree(hdr->res.scratch); - hdr->res.scratch = NULL; - } -} - static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_read_plus_scratch_free(hdr); - if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) @@ -5469,8 +5463,7 @@ static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, /* Note: We don't use READ_PLUS with pNFS yet */ if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; - hdr->res.scratch = kmalloc(32, GFP_KERNEL); - return hdr->res.scratch != NULL; + return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE); } return false; } @@ -6004,9 +5997,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, out_ok: ret = res.acl_len; out_free: - for (i = 0; i < npages; i++) - if (pages[i]) - __free_page(pages[i]); + while (--i >= 0) + __free_page(pages[i]); if (res.acl_scratch) __free_page(res.acl_scratch); kfree(pages); @@ -7181,8 +7173,15 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) goto out_restart; break; - case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: + if (data->arg.new_lock_owner != 0 && + nfs4_refresh_open_old_stateid(&data->arg.open_stateid, + lsp->ls_state)) + goto out_restart; + if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp)) + goto out_restart; + fallthrough; + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: if (data->arg.new_lock_owner != 0) { @@ -7573,7 +7572,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7591,7 +7590,7 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { switch (arg) { @@ -8792,6 +8791,8 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, #ifdef CONFIG_NFS_V4_1_MIGRATION calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif + if (test_bit(NFS_CS_DS, &clp->cl_flags)) + calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -8869,6 +8870,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre /* Save the EXCHANGE_ID verifier session trunk tests */ memcpy(clp->cl_confirm.data, argp->verifier.data, sizeof(clp->cl_confirm.data)); + if (resp->flags & EXCHGID4_FLAG_USE_PNFS_DS) + set_bit(NFS_CS_DS, &clp->cl_flags); out: trace_nfs4_exchange_id(clp, status); rpc_put_task(task); diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index ddbbf4fcda86..178001c90156 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -154,7 +154,7 @@ nfs4_get_device_info(struct nfs_server *server, set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: - for (i = 0; i < max_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); out_free_pdev: diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index a0112ad4937a..afd23910f3bf 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -852,6 +852,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; + unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -870,6 +871,8 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; if (da->da_transport != clp->cl_proto) @@ -943,7 +946,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, * Test this address for session trunking and * add as an alias */ - xprtdata.cred = nfs4_get_clid_cred(clp), + xprtdata.cred = nfs4_get_clid_cred(clp); rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f71eeee67e20..7dc21a48e3e7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -47,6 +47,8 @@ static struct nfs_pgio_header *nfs_readhdr_alloc(void) static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) { + if (rhdr->res.scratch != NULL) + kfree(rhdr->res.scratch); kmem_cache_free(nfs_rdata_cachep, rhdr); } @@ -108,6 +110,14 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size) +{ + WARN_ON(hdr->res.scratch != NULL); + hdr->res.scratch = kmalloc(size, GFP_KERNEL); + return hdr->res.scratch != NULL; +} +EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch); + static void nfs_readpage_release(struct nfs_page *req, int error) { struct folio *folio = nfs_page_to_folio(req); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2284f749d892..0d6473cb00cb 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1339,15 +1339,13 @@ error_splat_super: void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); - dev_t dev = s->s_dev; nfs_sysfs_move_sb_to_server(server); - generic_shutdown_super(s); + kill_anon_super(s); nfs_fscache_release_super_cookie(s); nfs_free_server(server); - free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index acda8f033d30..bf378ecd5d9f 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -345,8 +345,10 @@ void nfs_sysfs_move_sb_to_server(struct nfs_server *server) int ret = -ENOMEM; s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id); - if (s) + if (s) { ret = kobject_rename(&server->kobj, s); + kfree(s); + } if (ret < 0) pr_warn("NFS: rename sysfs %s failed (%d)\n", server->kobj.name, ret); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f4cca8f00c0c..8c1ee1a1a28f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -955,18 +958,16 @@ static void nfs_folio_clear_commit(struct folio *folio) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 8e9c1a0f8d38..1ed2f691ebb9 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -83,6 +83,15 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, int len = sizeof(__be32), ret, i; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + p = xdr_reserve_space(xdr, len + sizeof(__be32)); if (!p) return nfserr_resource; diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4c9b87850ab1..929248c6ca84 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,7 +19,7 @@ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage * is much larger than a sockaddr_in6. */ -struct svc_cacherep { +struct nfsd_cacherep { struct { /* Keep often-read xid, csum in the same cache line: */ __be32 k_xid; @@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *); -void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_cache_lookup(struct svc_rqst *rqstp, + struct nfsd_cacherep **cacherep); +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index e81d2a5cf381..bb205328e043 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -85,6 +85,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, int addr_len; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + /* len + padding for two strings */ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; ver_len = 20; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index fc8d5b7db9f8..268ef57751c4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -307,7 +307,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 518203821790..96e786b5e544 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -441,7 +441,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { - int empty; + unsigned char valid; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -457,7 +457,6 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); - state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't know which, so we allocate @@ -500,7 +499,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) * and effective cases: when there are no inheritable ACEs, * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + if (!state->valid && (flags & NFS4_ACL_TYPE_DEFAULT)) return NULL; /* @@ -622,11 +621,12 @@ static void process_one_v4_ace(struct posix_acl_state *state, struct nfs4_ace *ace) { u32 mask = ace->access_mask; + short type = ace2type(ace); int i; - state->empty = 0; + state->valid |= type; - switch (ace2type(ace)) { + switch (type) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->owner, mask); @@ -726,6 +726,30 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) process_one_v4_ace(&effective_acl_state, ace); } + + /* + * At this point, the default ACL may have zeroed-out entries for owner, + * group and other. That usually results in a non-sensical resulting ACL + * that denies all access except to any ACE that was explicitly added. + * + * The setfacl command solves a similar problem with this logic: + * + * "If a Default ACL entry is created, and the Default ACL contains + * no owner, owning group, or others entry, a copy of the ACL + * owner, owning group, or others entry is added to the Default ACL." + * + * Copy any missing ACEs from the effective set, if any ACEs were + * explicitly set. + */ + if (default_acl_state.valid) { + if (!(default_acl_state.valid & ACL_USER_OBJ)) + default_acl_state.owner = effective_acl_state.owner; + if (!(default_acl_state.valid & ACL_GROUP_OBJ)) + default_acl_state.group = effective_acl_state.group; + if (!(default_acl_state.valid & ACL_OTHER)) + default_acl_state.other = effective_acl_state.other; + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); if (IS_ERR(*pacl)) { ret = PTR_ERR(*pacl); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ae670807449..4199ede0583c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -297,12 +297,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, } if (d_really_is_positive(child)) { - status = nfs_ok; - /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(fhp); + status = fh_fill_both_attrs(fhp); + if (status != nfs_ok) + goto out; switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: @@ -345,7 +345,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; status = nfsd4_vfs_create(fhp, child, open); if (status != nfs_ok) goto out; @@ -380,6 +382,38 @@ out: return status; } +/** + * set_change_info - set up the change_info4 for a reply + * @cinfo: pointer to nfsd4_change_info to be populated + * @fhp: pointer to svc_fh to use as source + * + * Many operations in NFSv4 require change_info4 in the reply. This function + * populates that from the info that we (should!) have already collected. In + * the event that we didn't get any pre-attrs, just zero out both. + */ +static void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + cinfo->atomic = (u32)(fhp->fh_pre_saved && fhp->fh_post_saved && !fhp->fh_no_atomic_attr); + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + + /* + * If fetching the pre-change attributes failed, then we should + * have already failed the whole operation. We could have still + * failed to fetch post-change attributes however. + * + * If we didn't get post-op attrs, just zero-out the after + * field since we don't know what it should be. If the pre_saved + * field isn't set for some reason, throw warning and just copy + * whatever is in the after field. + */ + if (WARN_ON_ONCE(!fhp->fh_pre_saved)) + cinfo->before_change = 0; + if (!fhp->fh_post_saved) + cinfo->after_change = cinfo->before_change + 1; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -424,11 +458,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru } else { status = nfsd_lookup(rqstp, current_fh, open->op_fname, open->op_fnamelen, *resfh); - if (!status) + if (status == nfs_ok) /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(current_fh); + status = fh_fill_both_attrs(current_fh); } if (status) goto out; @@ -1024,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rename->rn_tname, rename->rn_tnamelen); if (status) return status; - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); return nfs_ok; } @@ -1313,12 +1347,11 @@ try_again: /* found a match */ if (ni->nsui_busy) { /* wait - and try again */ - prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, - TASK_INTERRUPTIBLE); + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE); spin_unlock(&nn->nfsd_ssc_lock); /* allow 20secs for mount/unmount for now - revisit */ - if (signal_pending(current) || + if (kthread_should_stop() || (schedule_timeout(20*HZ) == 0)) { finish_wait(&nn->nfsd_ssc_waitq, &wait); kfree(work); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3aefbad4cc09..8534693eb6a4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -649,6 +649,18 @@ find_readable_file(struct nfs4_file *f) return ret; } +static struct nfsd_file * +find_rw_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + spin_lock(&f->fi_lock); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); + spin_unlock(&f->fi_lock); + + return ret; +} + struct nfsd_file * find_any_file(struct nfs4_file *f) { @@ -1144,7 +1156,7 @@ static void block_delegations(struct knfsd_fh *fh) static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, - struct nfs4_clnt_odstate *odstate) + struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; long n; @@ -1170,7 +1182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); - dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_type = dl_type; dp->dl_retries = 1; dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -1354,9 +1366,9 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); if (clp->cl_minorversion) { + spin_lock(&clp->cl_lock); dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); - spin_lock(&clp->cl_lock); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); } @@ -5449,8 +5461,9 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; - struct nfsd_file *nf; + struct nfsd_file *nf = NULL; struct file_lock *fl; + u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks @@ -5460,15 +5473,35 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - nf = find_readable_file(fp); - if (!nf) { - /* - * We probably could attempt another open and get a read - * delegation, but for now, don't bother until the - * client actually sends us one. - */ - return ERR_PTR(-EAGAIN); + /* + * Try for a write delegation first. RFC8881 section 10.4 says: + * + * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, + * on its own, all opens." + * + * Furthermore the client can use a write delegation for most READ + * operations as well, so we require a O_RDWR file here. + * + * Offer a write delegation in the case of a BOTH open, and ensure + * we get the O_RDWR descriptor. + */ + if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { + nf = find_rw_file(fp); + dl_type = NFS4_OPEN_DELEGATE_WRITE; } + + /* + * If the file is being opened O_RDONLY or we couldn't get a O_RDWR + * file for some reason, then try for a read delegation instead. + */ + if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { + nf = find_readable_file(fp); + dl_type = NFS4_OPEN_DELEGATE_READ; + } + + if (!nf) + return ERR_PTR(-EAGAIN); + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) @@ -5491,11 +5524,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return ERR_PTR(status); status = -ENOMEM; - dp = alloc_init_deleg(clp, fp, odstate); + dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(dp, dl_type); if (!fl) goto out_clnt_odstate; @@ -5568,10 +5601,28 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) } /* - * Attempt to hand out a delegation. + * The Linux NFS server does not offer write delegations to NFSv4.0 + * clients in order to avoid conflicts between write delegations and + * GETATTRs requesting CHANGE or SIZE attributes. + * + * With NFSv4.1 and later minorversions, the SEQUENCE operation that + * begins each COMPOUND contains a client ID. Delegation recall can + * be avoided when the server recognizes the client sending a + * GETATTR also holds write delegation it conflicts with. * - * Note we don't support write delegations, and won't until the vfs has - * proper support for them. + * However, the NFSv4.0 protocol does not enable a server to + * determine that a GETATTR originated from the client holding the + * conflicting delegation versus coming from some other client. Per + * RFC 7530 Section 16.7.5, the server must recall or send a + * CB_GETATTR even when the GETATTR originates from the client that + * holds the conflicting delegation. + * + * An NFSv4.0 client can trigger a pathological situation if it + * always sends a DELEGRETURN preceded by a conflicting GETATTR in + * the same COMPOUND. COMPOUND execution will always stop at the + * GETATTR and the DELEGRETURN will never get executed. The server + * eventually revokes the delegation, which can result in loss of + * open or lock state. */ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, @@ -5590,8 +5641,6 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) open->op_recall = 1; - if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) - goto out_no_deleg; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; @@ -5606,6 +5655,9 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && + !clp->cl_minorversion) + goto out_no_deleg; break; default: goto out_no_deleg; @@ -5616,8 +5668,13 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); - trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); - open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; + trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + } else { + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); + } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: @@ -8341,3 +8398,68 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, { get_stateid(cstate, &u->write.wr_stateid); } + +/** + * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict + * @rqstp: RPC transaction context + * @inode: file to be checked for a conflict + * + * This function is called when there is a conflict between a write + * delegation and a change/size GETATTR from another client. The server + * must either use the CB_GETATTR to get the current values of the + * attributes from the client that holds the delegation or recall the + * delegation before replying to the GETATTR. See RFC 8881 section + * 18.7.4. + * + * The current implementation does not support CB_GETATTR yet. However + * this can avoid recalling the delegation could be added in follow up + * work. + * + * Returns 0 if there is no conflict; otherwise an nfs_stat + * code is returned. + */ +__be32 +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +{ + __be32 status; + struct file_lock_context *ctx; + struct file_lock *fl; + struct nfs4_delegation *dp; + + ctx = locks_inode_context(inode); + if (!ctx) + return 0; + spin_lock(&ctx->flc_lock); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_flags == FL_LAYOUT) + continue; + if (fl->fl_lmops != &nfsd_lease_mng_ops) { + /* + * non-nfs lease, if it's a lease with F_RDLCK then + * we are done; there isn't any write delegation + * on this inode + */ + if (fl->fl_type == F_RDLCK) + break; + goto break_lease; + } + if (fl->fl_type == F_WRLCK) { + dp = fl->fl_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + spin_unlock(&ctx->flc_lock); + return 0; + } +break_lease: + spin_unlock(&ctx->flc_lock); + nfsd_stats_wdeleg_getattr_inc(); + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; + } + break; + } + spin_unlock(&ctx->flc_lock); + return 0; +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b30dca7de8cc..2e40c74d2f72 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2984,6 +2984,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, if (status) goto out; } + if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + if (status) + goto out; + } err = vfs_getattr(&path, &stat, STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, @@ -3973,17 +3978,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); if (nfserr) return nfserr; - p = xdr_reserve_space(xdr, 32); + + p = xdr_reserve_space(xdr, XDR_UNIT * 8); if (!p) return nfserr_resource; *p++ = cpu_to_be32(open->op_recall); /* + * Always flush on close + * * TODO: space_limit's in delegations */ *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); - *p++ = cpu_to_be32(~(u32)0); - *p++ = cpu_to_be32(~(u32)0); + *p++ = xdr_zero; + *p++ = xdr_zero; /* * TODO: ACE's in delegations @@ -4678,20 +4686,17 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(gdev->gd_layout_type); - /* If maxcount is 0 then just update notifications */ - if (gdev->gd_maxcount != 0) { - ops = nfsd4_layout_ops[gdev->gd_layout_type]; - nfserr = ops->encode_getdeviceinfo(xdr, gdev); - if (nfserr) { - /* - * We don't bother to burden the layout drivers with - * enforcing gd_maxcount, just tell the client to - * come back with a bigger buffer if it's not enough. - */ - if (xdr->buf->len + 4 > gdev->gd_maxcount) - goto toosmall; - return nfserr; - } + ops = nfsd4_layout_ops[gdev->gd_layout_type]; + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + return nfserr; } if (gdev->gd_notify_types) { diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index a8eda1c85829..80621a709510 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -84,11 +84,11 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } -static struct svc_cacherep * -nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, - struct nfsd_net *nn) +static struct nfsd_cacherep * +nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, + struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); if (rp) { @@ -110,36 +110,64 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, return rp; } -static void -nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, - struct nfsd_net *nn) +static void nfsd_cacherep_free(struct nfsd_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); + if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); + kmem_cache_free(drc_slab, rp); +} + +static unsigned long +nfsd_cacherep_dispose(struct list_head *dispose) +{ + struct nfsd_cacherep *rp; + unsigned long freed = 0; + + while (!list_empty(dispose)) { + rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru); + list_del(&rp->c_lru); + nfsd_cacherep_free(rp); + freed++; } + return freed; +} + +static void +nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + struct nfsd_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } - kmem_cache_free(drc_slab, rp); } static void -nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, + struct nfsd_net *nn) +{ + nfsd_cacherep_unlink_locked(nn, b, rp); + nfsd_cacherep_free(rp); +} + +static void +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, struct nfsd_net *nn) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(b, rp, nn); + nfsd_cacherep_unlink_locked(nn, b, rp); spin_unlock(&b->cache_lock); + nfsd_cacherep_free(rp); } int nfsd_drc_slab_create(void) { drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct svc_cacherep), 0, 0, NULL); + sizeof(struct nfsd_cacherep), 0, 0, NULL); return drc_slab ? 0: -ENOMEM; } @@ -208,7 +236,7 @@ out_shrinker: void nfsd_reply_cache_shutdown(struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; unsigned int i; unregister_shrinker(&nn->nfsd_reply_cache_shrinker); @@ -216,7 +244,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; while (!list_empty(head)) { - rp = list_first_entry(head, struct svc_cacherep, c_lru); + rp = list_first_entry(head, struct nfsd_cacherep, c_lru); nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i], rp, nn); } @@ -233,7 +261,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) * not already scheduled. */ static void -lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); @@ -247,12 +275,21 @@ nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn) return &nn->drc_hashtbl[hash]; } -static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, - unsigned int max) +/* + * Remove and return no more than @max expired entries in bucket @b. + * If @max is zero, do not limit the number of removed entries. + */ +static void +nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + unsigned int max, struct list_head *dispose) { - struct svc_cacherep *rp, *tmp; - long freed = 0; + unsigned long expiry = jiffies - RC_EXPIRE; + struct nfsd_cacherep *rp, *tmp; + unsigned int freed = 0; + + lockdep_assert_held(&b->cache_lock); + /* The bucket LRU is ordered oldest-first. */ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { /* * Don't free entries attached to calls that are still @@ -260,60 +297,77 @@ static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, */ if (rp->c_state == RC_INPROG) continue; + if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && - time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) + time_before(expiry, rp->c_timestamp)) break; - nfsd_reply_cache_free_locked(b, rp, nn); - if (max && freed++ > max) + + nfsd_cacherep_unlink_locked(nn, b, rp); + list_add(&rp->c_lru, dispose); + + if (max && ++freed > max) break; } - return freed; } -static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn) +/** + * nfsd_reply_cache_count - count_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Returns the total number of entries in the duplicate reply cache. To + * keep things simple and quick, this is not the number of expired entries + * in the cache (ie, the number that would be removed by a call to + * nfsd_reply_cache_scan). + */ +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - return prune_bucket(b, nn, 3); + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + + return atomic_read(&nn->num_drc_entries); } -/* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. +/** + * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Free expired entries on each bucket's LRU list until we've released + * nr_to_scan freed objects. Nothing will be released if the cache + * has not exceeded it's max_drc_entries limit. + * + * Returns the number of entries released by this call. */ -static long -prune_cache_entries(struct nfsd_net *nn) +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct nfsd_net *nn = container_of(shrink, + struct nfsd_net, nfsd_reply_cache_shrinker); + unsigned long freed = 0; + LIST_HEAD(dispose); unsigned int i; - long freed = 0; for (i = 0; i < nn->drc_hashsize; i++) { struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i]; if (list_empty(&b->lru_head)) continue; + spin_lock(&b->cache_lock); - freed += prune_bucket(b, nn, 0); + nfsd_prune_bucket_locked(nn, b, 0, &dispose); spin_unlock(&b->cache_lock); - } - return freed; -} -static unsigned long -nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + freed += nfsd_cacherep_dispose(&dispose); + if (freed > sc->nr_to_scan) + break; + } - return atomic_read(&nn->num_drc_entries); + trace_nfsd_drc_gc(nn, freed); + return freed; } -static unsigned long -nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); - - return prune_cache_entries(nn); -} /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes */ @@ -348,8 +402,8 @@ nfsd_cache_csum(struct svc_rqst *rqstp) } static int -nfsd_cache_key_cmp(const struct svc_cacherep *key, - const struct svc_cacherep *rp, struct nfsd_net *nn) +nfsd_cache_key_cmp(const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp, struct nfsd_net *nn) { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { @@ -365,11 +419,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, * Must be called with cache_lock held. Returns the found entry or * inserts an empty key on failure. */ -static struct svc_cacherep * -nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, +static struct nfsd_cacherep * +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, struct nfsd_net *nn) { - struct svc_cacherep *rp, *ret = key; + struct nfsd_cacherep *rp, *ret = key; struct rb_node **p = &b->rb_head.rb_node, *parent = NULL; unsigned int entries = 0; @@ -378,7 +432,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, while (*p != NULL) { ++entries; parent = *p; - rp = rb_entry(parent, struct svc_cacherep, c_node); + rp = rb_entry(parent, struct nfsd_cacherep, c_node); cmp = nfsd_cache_key_cmp(key, rp, nn); if (cmp < 0) @@ -411,6 +465,7 @@ out: /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none * is found, we try to grab the oldest expired entry off the LRU list. If @@ -423,16 +478,17 @@ out: * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp) +int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; - struct svc_cacherep *rp, *found; + struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; + unsigned long freed; + LIST_HEAD(dispose); int rtn = RC_DOIT; - rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { nfsd_stats_rc_nocache_inc(); goto out; @@ -445,7 +501,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) * preallocate an entry. */ nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - rp = nfsd_reply_cache_alloc(rqstp, csum, nn); + rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -454,20 +510,18 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found = nfsd_cache_insert(b, rp, nn); if (found != rp) goto found_entry; - - nfsd_stats_rc_misses_inc(); - rqstp->rq_cacherep = rp; + *cacherep = rp; rp->c_state = RC_INPROG; + nfsd_prune_bucket_locked(nn, b, 3, &dispose); + spin_unlock(&b->cache_lock); + freed = nfsd_cacherep_dispose(&dispose); + trace_nfsd_drc_gc(nn, freed); + + nfsd_stats_rc_misses_inc(); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); - - nfsd_prune_bucket(b, nn); - -out_unlock: - spin_unlock(&b->cache_lock); -out: - return rtn; + goto out; found_entry: /* We found a matching entry which is either in progress or done. */ @@ -505,12 +559,16 @@ found_entry: out_trace: trace_nfsd_drc_found(nn, rqstp, rtn); - goto out_unlock; +out_unlock: + spin_unlock(&b->cache_lock); +out: + return rtn; } /** * nfsd_cache_update - Update an entry in the duplicate reply cache. * @rqstp: svc_rqst with a finished Reply + * @rp: IN: DRC entry for this request * @cachetype: which cache to update * @statp: pointer to Reply's NFS status code, or NULL * @@ -528,10 +586,10 @@ out_trace: * nfsd failed to encode a reply that otherwise would have been cached. * In this case, nfsd_cache_update is called with statp == NULL. */ -void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; struct nfsd_drc_bucket *b; int len; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1b8b1aab9a15..7ed02fb88a36 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1105,6 +1105,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) if (!nn->nfsd_serv) return -EBUSY; trace_nfsd_end_grace(netns(file)); + nfsd4_end_grace(nn); break; default: return -EINVAL; @@ -1131,7 +1132,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) /* Following advice from simple_fill_super documentation: */ inode->i_ino = iunique(sb, NFSD_MaxReserved); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; @@ -1626,6 +1627,7 @@ static void __exit exit_nfsd(void) } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d88498f8b275..11c14faa6c67 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -96,7 +96,12 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_put(struct net *net); +static inline void nfsd_put(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_put(nn->nfsd_serv); +} bool i_am_nfsd(void); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c291389a1d71..355bf0db3235 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -614,7 +614,7 @@ out_negative: * @fhp: file handle to be updated * */ -void fh_fill_pre_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode; @@ -622,12 +622,12 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; + return nfs_ok; inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) - return; + return err; if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -636,6 +636,7 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) fhp->fh_pre_ctime = stat.ctime; fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; + return nfs_ok; } /** @@ -643,26 +644,27 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) * @fhp: file handle to be updated * */ -void fh_fill_post_attrs(struct svc_fh *fhp) +__be32 fh_fill_post_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; if (fhp->fh_no_wcc) - return; + return nfs_ok; if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); if (err) - return; + return err; fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); + return nfs_ok; } /** @@ -672,16 +674,20 @@ void fh_fill_post_attrs(struct svc_fh *fhp) * This is used when the directory wasn't changed, but wcc attributes * are needed anyway. */ -void fh_fill_both_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp) { - fh_fill_post_attrs(fhp); - if (!fhp->fh_post_saved) - return; + __be32 err; + + err = fh_fill_post_attrs(fhp); + if (err) + return err; + fhp->fh_pre_change = fhp->fh_post_change; fhp->fh_pre_mtime = fhp->fh_post_attr.mtime; fhp->fh_pre_ctime = fhp->fh_post_attr.ctime; fhp->fh_pre_size = fhp->fh_post_attr.size; fhp->fh_pre_saved = true; + return nfs_ok; } /* diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4e0ecf0ae2cf..40426f899e76 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -294,7 +294,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) } u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); -extern void fh_fill_pre_attrs(struct svc_fh *fhp); -extern void fh_fill_post_attrs(struct svc_fh *fhp); -extern void fh_fill_both_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); +__be32 fh_fill_post_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); #endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2154fa63c5f2..c7af1095f6b5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -542,9 +542,14 @@ static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +static void nfsd_last_thread(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { @@ -554,6 +559,8 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) #endif } + svc_xprt_destroy_all(serv, net); + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -644,7 +651,8 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); - nfsd_put(net); + nfsd_last_thread(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); } @@ -674,9 +682,6 @@ int nfsd_create_serv(struct net *net) serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { - /* NOT nfsd_put() as notifiers (see below) haven't - * been set up yet. - */ svc_put(serv); return error; } @@ -719,29 +724,6 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) return 0; } -/* This is the callback for kref_put() below. - * There is no code here as the first thing to be done is - * call svc_shutdown_net(), but we cannot get the 'net' from - * the kref. So do all the work when kref_put returns true. - */ -static void nfsd_noop(struct kref *ref) -{ -} - -void nfsd_put(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { - svc_xprt_destroy_all(nn->nfsd_serv, net); - nfsd_last_thread(nn->nfsd_serv, net); - svc_destroy(&nn->nfsd_serv->sv_refcnt); - spin_lock(&nfsd_notifier_lock); - nn->nfsd_serv = NULL; - spin_unlock(&nfsd_notifier_lock); - } -} - int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; @@ -792,7 +774,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - nfsd_put(net); + svc_put(nn->nfsd_serv); return err; } @@ -807,6 +789,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) int error; bool nfsd_up_before; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -826,22 +809,25 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) goto out; nfsd_up_before = nn->nfsd_net_up; + serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; - error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + error = svc_set_num_threads(serv, NULL, nrservs); if (error) goto out_shutdown; - error = nn->nfsd_serv->sv_nrthreads; + error = serv->sv_nrthreads; + if (error == 0) + nfsd_last_thread(net); out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown_net(net); out_put: /* Threads now hold service active */ if (xchg(&nn->keep_active, 0)) - nfsd_put(net); - nfsd_put(net); + svc_put(serv); + svc_put(serv); out: mutex_unlock(&nfsd_mutex); return error; @@ -953,7 +939,6 @@ nfsd(void *vrqstp) struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int err; /* At this point, the thread shares current->fs * with the init process. We need to create files with the @@ -965,15 +950,6 @@ nfsd(void *vrqstp) current->fs->umask = 0; - /* - * thread is spawned with all signals set to SIG_IGN, re-enable - * the ones that will bring down the thread - */ - allow_signal(SIGKILL); - allow_signal(SIGHUP); - allow_signal(SIGINT); - allow_signal(SIGQUIT); - atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -981,54 +957,19 @@ nfsd(void *vrqstp) /* * The main request loop */ - for (;;) { + while (!kthread_should_stop()) { /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) - ; - if (err == -EINTR) - break; - validate_process_creds(); - svc_process(rqstp); + svc_recv(rqstp); validate_process_creds(); } - /* Clear signals before calling svc_exit_thread() */ - flush_signals(current); - atomic_dec(&nfsdstats.th_cnt); out: - /* Take an extra ref so that the svc_put in svc_exit_thread() - * doesn't call svc_destroy() - */ - svc_get(nn->nfsd_serv); - /* Release the thread */ svc_exit_thread(rqstp); - - /* We need to drop a ref, but may not drop the last reference - * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that - * could deadlock with nfsd_shutdown_threads() waiting for us. - * So three options are: - * - drop a non-final reference, - * - get the mutex without waiting - * - sleep briefly andd try the above again - */ - while (!svc_put_not_last(nn->nfsd_serv)) { - if (mutex_trylock(&nfsd_mutex)) { - nfsd_put(net); - mutex_unlock(&nfsd_mutex); - break; - } - msleep(20); - } - return 0; } @@ -1046,6 +987,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; + struct nfsd_cacherep *rp; /* * Give the xdr decoder a chance to change this if it wants @@ -1056,7 +998,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; - switch (nfsd_cache_lookup(rqstp)) { + rp = NULL; + switch (nfsd_cache_lookup(rqstp, &rp)) { case RC_DOIT: break; case RC_REPLY: @@ -1072,7 +1015,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; - nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); out_cached_reply: return 1; @@ -1082,13 +1025,13 @@ out_decode_err: return 1; out_update_drop: - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } @@ -1139,11 +1082,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { + struct seq_file *seq = file->private_data; + struct svc_serv *serv = seq->private; int ret = seq_release(inode, file); - struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - nfsd_put(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d49d3060ed4f..cbddcf484dba 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); return clp->cl_state == NFSD4_EXPIRABLE; } + +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, + struct inode *inode); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 777e24e5da33..63797635e1c3 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -65,6 +65,8 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, " %lld", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); } + seq_printf(seq, "\nwdeleg_getattr %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 9b43dc3d9991..cf5524e7ca06 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -22,6 +22,7 @@ enum { NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, #define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ #endif NFSD_STATS_COUNTERS_NUM }; @@ -93,4 +94,10 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); } +#ifdef CONFIG_NFSD_V4 +static inline void nfsd_stats_wdeleg_getattr_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); +} +#endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 2af74983f146..803904348871 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -607,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_write); DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); @@ -1240,8 +1241,8 @@ TRACE_EVENT(nfsd_drc_found, TRACE_EVENT(nfsd_drc_mismatch, TP_PROTO( const struct nfsd_net *nn, - const struct svc_cacherep *key, - const struct svc_cacherep *rp + const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp ), TP_ARGS(nn, key, rp), TP_STRUCT__entry( @@ -1261,6 +1262,28 @@ TRACE_EVENT(nfsd_drc_mismatch, __entry->ingress) ); +TRACE_EVENT_CONDITION(nfsd_drc_gc, + TP_PROTO( + const struct nfsd_net *nn, + unsigned long freed + ), + TP_ARGS(nn, freed), + TP_CONDITION(freed > 0), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(unsigned long, freed) + __field(int, total) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->freed = freed; + __entry->total = atomic_read(&nn->num_drc_entries); + ), + TP_printk("boot_time=%16llx total=%d freed=%lu", + __entry->boot_time, __entry->total, __entry->freed + ) +); + TRACE_EVENT(nfsd_cb_args, TP_PROTO( const struct nfs4_client *clp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2c9074ab2315..48260cf68fde 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -520,7 +520,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode->i_ctime.tv_sec) + if (check_guard && guardtime != inode_get_ctime(inode).tv_sec) return nfserr_notsync; /* @@ -1540,7 +1540,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dput(dchild); if (err) goto out_unlock; - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); fh_fill_post_attrs(fhp); out_unlock: @@ -1635,13 +1637,16 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, inode_unlock(dentry->d_inode); goto out_drop_write; } - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(dentry->d_inode); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1703,7 +1708,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (d_really_is_negative(dold)) goto out_dput; - fh_fill_pre_attrs(ffhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_dput; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); @@ -1789,8 +1796,12 @@ retry: } trap = lock_rename(tdentry, fdentry); - fh_fill_pre_attrs(ffhp); - fh_fill_pre_attrs(tfhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_unlock; + err = fh_fill_pre_attrs(tfhp); + if (err != nfs_ok) + goto out_unlock; odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1857,6 +1868,7 @@ retry: fh_fill_post_attrs(ffhp); fh_fill_post_attrs(tfhp); } +out_unlock: unlock_rename(tdentry, fdentry); fh_drop_write(ffhp); @@ -1916,12 +1928,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, goto out_unlock; } rinode = d_inode(rdentry); - ihold(rinode); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ihold(rinode); if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - fh_fill_pre_attrs(fhp); if (type != S_IFDIR) { int retries; @@ -2341,16 +2355,18 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); - + err = nfsd_xattr_errno(ret); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - return nfsd_xattr_errno(ret); + return err; } __be32 @@ -2368,15 +2384,17 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, if (ret) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - - ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, - len, flags, NULL); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, + name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); + err = nfsd_xattr_errno(ret); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - - return nfsd_xattr_errno(ret); + return err; } #endif diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 510978e602da..9d918a79dc16 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -774,17 +774,6 @@ void warn_on_nonidempotent_op(struct nfsd4_op *op); #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) -static inline void -set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); - - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; -} - - bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr); diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 7d59567465e1..7dae168e346e 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NILFS2_FS tristate "NILFS2 file system support" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 6ce8617b562d..7342de296ec3 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -205,7 +205,8 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, int ret; spin_lock(lock); - if (prev->bh && blkoff == prev->blkoff) { + if (prev->bh && blkoff == prev->blkoff && + likely(buffer_uptodate(prev->bh))) { get_bh(prev->bh); *bhp = prev->bh; spin_unlock(lock); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index decd6471300b..bce734b68f08 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); } /* @@ -519,7 +519,7 @@ got_it: de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, page->mapping, from, to); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: @@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); out: nilfs_put_page(page); return err; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a9eb3487efb2..740ce26d1e76 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 35bc79305318..1a8bd5993476 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) atomic64_inc(&root->inodes_count); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); @@ -450,10 +450,10 @@ int nilfs_read_inode_common(struct inode *inode, set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime), + le32_to_cpu(raw_inode->i_ctime_nsec)); inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) return -EIO; /* this inode is for metadata and corrupted */ @@ -768,9 +768,9 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le64(inode->i_size); - raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); @@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode) nilfs_truncate_bmap(ii, blkoff); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); @@ -1025,7 +1025,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) int err; spin_lock(&nilfs->ns_inode_lock); - if (ii->i_bh == NULL) { + if (ii->i_bh == NULL || unlikely(!buffer_uptodate(ii->i_bh))) { spin_unlock(&nilfs->ns_inode_lock); err = nilfs_ifile_get_inode_block(ii->i_root->ifile, inode->i_ino, pbh); @@ -1034,7 +1034,10 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) spin_lock(&nilfs->ns_inode_lock); if (ii->i_bh == NULL) ii->i_bh = *pbh; - else { + else if (unlikely(!buffer_uptodate(ii->i_bh))) { + __brelse(ii->i_bh); + ii->i_bh = *pbh; + } else { brelse(*pbh); *pbh = ii->i_bh; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 1dfbc0c34513..40ffade49f38 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -149,7 +149,7 @@ int nilfs_fileattr_set(struct mnt_idmap *idmap, NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE); nilfs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c7024da8f1e2..2a4e7f4a8102 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -185,7 +185,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, if (err) return err; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); drop_nlink(inode); err = 0; out: @@ -387,7 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); drop_nlink(new_inode); @@ -406,7 +406,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); nilfs_delete_entry(old_de, old_page); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 581691e4be49..7ec16879756e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -725,6 +725,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, struct folio *folio = fbatch.folios[i]; folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + /* Exclude folios removed from the address space */ + folio_unlock(folio); + continue; + } head = folio_buffers(folio); if (!head) { create_empty_buffers(&folio->page, i_blocksize(inode), 0); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 0ef8c71bde8e..a5d1fa4e7552 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -35,6 +35,7 @@ #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include "nilfs.h" #include "export.h" #include "mdt.h" @@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) } struct nilfs_super_data { - struct block_device *bdev; __u64 cno; int flags; }; @@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd) static int nilfs_set_bdev_super(struct super_block *s, void *data) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; + s->s_dev = *(dev_t *)data; return 0; } static int nilfs_test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } static struct dentry * nilfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct nilfs_super_data sd; + struct nilfs_super_data sd = { .flags = flags }; struct super_block *s; - struct dentry *root_dentry; - int err, s_new = false; + dev_t dev; + int err; - sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, - NULL); - if (IS_ERR(sd.bdev)) - return ERR_CAST(sd.bdev); + if (nilfs_identify(data, &sd)) + return ERR_PTR(-EINVAL); - sd.cno = 0; - sd.flags = flags; - if (nilfs_identify((char *)data, &sd)) { - err = -EINVAL; - goto failed; - } + err = lookup_bdev(dev_name, &dev); + if (err) + return ERR_PTR(err); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&sd.bdev->bd_fsfreeze_mutex); - if (sd.bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); - err = -EBUSY; - goto failed; - } s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, - sd.bdev); - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) { - err = PTR_ERR(s); - goto failed; - } + &dev); + if (IS_ERR(s)) + return ERR_CAST(s); if (!s->s_root) { - s_new = true; - - /* New superblock instance created */ - snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); - sb_set_blocksize(s, block_size(sd.bdev)); - - err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); + /* + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * __invalidate_device()). It is safe because we have active sb + * reference and SB_BORN is not set yet. + */ + up_write(&s->s_umount); + err = setup_bdev_super(s, flags, NULL); + down_write(&s->s_umount); + if (!err) + err = nilfs_fill_super(s, data, + flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; @@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (sd.cno) { + struct dentry *root_dentry; + err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); if (err) goto failed_super; - } else { - root_dentry = dget(s->s_root); + return root_dentry; } - if (!s_new) - blkdev_put(sd.bdev, fs_type); - - return root_dentry; + return dget(s->s_root); failed_super: deactivate_locked_super(s); - - failed: - if (!s_new) - blkdev_put(sd.bdev, fs_type); return ERR_PTR(err); } diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig index c7857e36adbb..2a601af6f3bd 100644 --- a/fs/nls/Kconfig +++ b/fs/nls/Kconfig @@ -617,4 +617,7 @@ config NLS_UTF8 input/output character sets. Say Y here for the UTF-8 encoding of the Unicode/ISO9646 universal character set. +config NLS_UCS2_UTILS + tristate + endif # NLS diff --git a/fs/nls/Makefile b/fs/nls/Makefile index ac54db297128..5062c699d041 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -54,3 +54,4 @@ obj-$(CONFIG_NLS_MAC_INUIT) += mac-inuit.o obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o +obj-$(CONFIG_NLS_UCS2_UTILS) += nls_ucs2_utils.o diff --git a/fs/nls/nls_ucs2_data.h b/fs/nls/nls_ucs2_data.h new file mode 100644 index 000000000000..1f454dc0f4e0 --- /dev/null +++ b/fs/nls/nls_ucs2_data.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NLS_UCS2_DATA_H +#define _NLS_UCS2_DATA_H + +struct UniCaseRange { + wchar_t start; + wchar_t end; + signed char *table; +}; + +extern signed char NlsUniUpperTable[512]; +extern const struct UniCaseRange NlsUniUpperRange[]; + +#endif /* _NLS_UCS2_DATA_H */ diff --git a/fs/smb/server/uniupr.h b/fs/nls/nls_ucs2_utils.c index 26583b776897..a69781c54dd8 100644 --- a/fs/smb/server/uniupr.h +++ b/fs/nls/nls_ucs2_utils.c @@ -1,19 +1,27 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Some of the source code in this file came from fs/cifs/uniupr.h * Copyright (c) International Business Machines Corp., 2000,2002 * - * uniupr.h - Unicode compressed case ranges + * Some of the source code in this file came from fs/cifs/cifs_unicode.c + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * Modified by Steve French (sfrench@us.ibm.com) + * Modified by Namjae Jeon (linkinjeon@kernel.org) * */ -#ifndef __KSMBD_UNIUPR_H -#define __KSMBD_UNIUPR_H +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/unaligned.h> +#include "nls_ucs2_utils.h" + +MODULE_LICENSE("GPL"); -#ifndef UNIUPR_NOUPPER /* * Latin upper case */ -signed char SmbUniUpperTable[512] = { +signed char NlsUniUpperTable[512] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ @@ -51,6 +59,7 @@ signed char SmbUniUpperTable[512] = { 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ }; +EXPORT_SYMBOL_GPL(NlsUniUpperTable); /* Upper case range - Greek */ static signed char UniCaseRangeU03a0[47] = { @@ -126,7 +135,7 @@ static signed char UniCaseRangeUff40[27] = { /* * Upper Case Range */ -const struct UniCaseRange SmbUniUpperRange[] = { +const struct UniCaseRange NlsUniUpperRange[] = { {0x03a0, 0x03ce, UniCaseRangeU03a0}, {0x0430, 0x045f, UniCaseRangeU0430}, {0x0490, 0x04cc, UniCaseRangeU0490}, @@ -134,135 +143,4 @@ const struct UniCaseRange SmbUniUpperRange[] = { {0xff40, 0xff5a, UniCaseRangeUff40}, {0} }; -#endif - -#ifndef UNIUPR_NOLOWER -/* - * Latin lower case - */ -signed char CifsUniLowerTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 040-04f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, - 0, 0, 0, /* 050-05f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, /* 0c0-0cf */ - 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, - 32, 32, 32, 0, /* 0d0-0df */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ - 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ - 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, - 0, /* 170-17f */ - 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, - 0, /* 180-18f */ - 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ - 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ - 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ -}; - -/* Lower case range - Greek */ -static signed char UniCaseRangeL0380[44] = { - 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 390-39f */ - 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* Lower case range - Cyrillic */ -static signed char UniCaseRangeL0400[48] = { - 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, - 0, 80, 80, /* 400-40f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 410-41f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 420-42f */ -}; - -/* Lower case range - Extended cyrillic */ -static signed char UniCaseRangeL0490[60] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ - 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, -}; - -/* Lower case range - Extended latin and greek */ -static signed char UniCaseRangeL1e00[504] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ - 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, - 0, 0, /* 1fc0-1fcf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, - 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Lower case range - Wide latin */ -static signed char UniCaseRangeLff20[27] = { - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, /* ff20-ff2f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* - * Lower Case Range - */ -const struct UniCaseRange CifsUniLowerRange[] = { - {0x0380, 0x03ab, UniCaseRangeL0380}, - {0x0400, 0x042f, UniCaseRangeL0400}, - {0x0490, 0x04cb, UniCaseRangeL0490}, - {0x1e00, 0x1ff7, UniCaseRangeL1e00}, - {0xff20, 0xff3a, UniCaseRangeLff20}, - {0} -}; -#endif - -#endif /* __KSMBD_UNIUPR_H */ +EXPORT_SYMBOL_GPL(NlsUniUpperRange); diff --git a/fs/nls/nls_ucs2_utils.h b/fs/nls/nls_ucs2_utils.h new file mode 100644 index 000000000000..ef18d30db1d0 --- /dev/null +++ b/fs/nls/nls_ucs2_utils.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Some of the source code in this file came from fs/cifs/cifs_unicode.c + * and then via server/unicode.c + * cifs_unicode: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * + * + * Notes: + * These APIs are based on the C library functions. The semantics + * should match the C functions but with expanded size operands. + * + * The upper/lower functions are based on a table created by mkupr. + * This is a compressed table of upper and lower case conversion. + * + */ +#ifndef _NLS_UCS2_UTILS_H +#define _NLS_UCS2_UTILS_H + +#include <asm/byteorder.h> +#include <linux/types.h> +#include <linux/nls.h> +#include <linux/unicode.h> +#include "nls_ucs2_data.h" + +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERISK ((__u16)('*' + 0xF000)) +#define UNI_QUESTION ((__u16)('?' + 0xF000)) +#define UNI_COLON ((__u16)(':' + 0xF000)) +#define UNI_GRTRTHAN ((__u16)('>' + 0xF000)) +#define UNI_LESSTHAN ((__u16)('<' + 0xF000)) +#define UNI_PIPE ((__u16)('|' + 0xF000)) +#define UNI_SLASH ((__u16)('\\' + 0xF000)) + +/* + * UniStrcat: Concatenate the second string to the first + * + * Returns: + * Address of the first string + */ +static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + + while (*ucs1++) + /*NULL*/; /* To end of first string */ + ucs1--; /* Return to the null */ + while ((*ucs1++ = *ucs2++)) + /*NULL*/; /* copy string 2 over */ + return anchor; +} + +/* + * UniStrchr: Find a character in a string + * + * Returns: + * Address of first occurrence of character in string + * or NULL if the character is not in the string + */ +static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc) +{ + while ((*ucs != uc) && *ucs) + ucs++; + + if (*ucs == uc) + return (wchar_t *)ucs; + return NULL; +} + +/* + * UniStrcmp: Compare two strings + * + * Returns: + * < 0: First string is less than second + * = 0: Strings are equal + * > 0: First string is greater than second + */ +static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) +{ + while ((*ucs1 == *ucs2) && *ucs1) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)*ucs2; +} + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)) + /*NULL*/; + return anchor; +} + +/* + * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) + */ +static inline size_t UniStrlen(const wchar_t *ucs1) +{ + int i = 0; + + while (*ucs1++) + i++; + return i; +} + +/* + * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a + * string (length limited) + */ +static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen) +{ + int i = 0; + + while (*ucs1++) { + i++; + if (i >= maxlen) + break; + } + return i; +} + +/* + * UniStrncat: Concatenate length limited string + */ +static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; /* save pointer to string 1 */ + + while (*ucs1++) + /*NULL*/; + ucs1--; /* point to null terminator of s1 */ + while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ + ucs1++; + ucs2++; + } + *ucs1 = 0; /* Null terminate the result */ + return anchor; +} + +/* + * UniStrncmp: Compare length limited string + */ +static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == *ucs2) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)*ucs2; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int +UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)__le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrstr: Find a string in a string + * + * Returns: + * Address of first match found + * NULL if no matching string is found + */ +static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) +{ + const wchar_t *anchor1 = ucs1; + const wchar_t *anchor2 = ucs2; + + while (*ucs1) { + if (*ucs1 == *ucs2) { + /* Partial match found */ + ucs1++; + ucs2++; + } else { + if (!*ucs2) /* Match found */ + return (wchar_t *)anchor1; + ucs1 = ++anchor1; /* No match */ + ucs2 = anchor2; + } + } + + if (!*ucs2) /* Both end together */ + return (wchar_t *)anchor1; /* Match found */ + return NULL; /* No match */ +} + +#ifndef UNIUPR_NOUPPER +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(NlsUniUpperTable)) { + /* Latin characters */ + return uc + NlsUniUpperTable[uc]; /* Use base tables */ + } + + rp = NlsUniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + return uc; /* Past last range */ +} + +/* + * UniStrupr: Upper case a unicode string + */ +static inline __le16 *UniStrupr(register __le16 *upin) +{ + register __le16 *up; + + up = upin; + while (*up) { /* For all characters */ + *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); + up++; + } + return upin; /* Return input pointer */ +} +#endif /* UNIUPR_NOUPPER */ + +#endif /* _NLS_UCS2_UTILS_H */ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 190aa717fa32..ebdcc25df0f7 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -199,7 +199,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) } /* this conversion is done only at watch creation */ -static __u32 convert_arg(unsigned long arg) +static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; @@ -258,7 +258,7 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. */ -int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; diff --git a/fs/nsfs.c b/fs/nsfs.c index f602a96a1afe..647a22433bd8 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -84,7 +84,7 @@ slow: return -ENOMEM; } inode->i_ino = ns->inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index f93e69a61283..7b2509741735 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS_FS tristate "NTFS file system support" + select BUFFER_HEAD select NLS help NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 6c3f38d66579..99ac6ea277c4 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -654,7 +654,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = ntfs2utc(si->last_mft_change_time); + inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. @@ -1218,7 +1218,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; - vi->i_ctime = base_vi->i_ctime; + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); vi->i_atime = base_vi->i_atime; vi->i_generation = ni->seq_no = base_ni->seq_no; @@ -1484,7 +1484,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; - vi->i_ctime = base_vi->i_ctime; + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); vi->i_atime = base_vi->i_atime; vi->i_generation = ni->seq_no = base_ni->seq_no; /* Set inode type to zero but preserve permissions. */ @@ -2804,13 +2804,14 @@ done: */ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { struct timespec64 now = current_time(VFS_I(base_ni)); + struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); int sync_it = 0; if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || - !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now)) + !timespec64_equal(&ctime, &now)) sync_it = 1; + inode_set_ctime_to_ts(VFS_I(base_ni), now); VFS_I(base_ni)->i_mtime = now; - VFS_I(base_ni)->i_ctime = now; if (sync_it) mark_inode_dirty_sync(VFS_I(base_ni)); @@ -2928,7 +2929,7 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (ia_valid & ATTR_MTIME) vi->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) - vi->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(vi, attr->ia_ctime); mark_inode_dirty(vi); out: return err; @@ -3004,7 +3005,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_ctime); + nt = utc2ntfs(inode_get_ctime(vi)); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0155f106ec34..ad1a8f72da22 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2682,8 +2682,7 @@ mft_rec_already_initialized: vi->i_mode &= ~S_IWUGO; /* Set the inode times to the current time. */ - vi->i_atime = vi->i_mtime = vi->i_ctime = - current_time(vi); + vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi); /* * Set the file size to 0, the ntfs inode sizes are set to 0 by * the call to ntfs_init_big_inode() below. diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 96cc236f7f7b..cdfdf51e55d7 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS3_FS tristate "NTFS Read-Write file system support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 1d6c824246c4..962f12ce6c0a 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -85,7 +85,7 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; @@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, err = 0; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); if (IS_SYNC(inode)) { @@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) ni_unlock(ni); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (!IS_DIRSYNC(inode)) { dirty = 1; } else { @@ -642,7 +642,7 @@ out: filemap_invalidate_unlock(mapping); if (!err) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); } diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 16bd9faa2d28..2b85cb10f0be 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3265,6 +3265,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; + struct timespec64 ctime = inode_get_ctime(inode); /* Update times in standard attribute. */ std = ni_std(ni); @@ -3280,7 +3281,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } - dup.c_time = kernel2nt(&inode->i_ctime); + dup.c_time = kernel2nt(&ctime); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; modified = true; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index dc7e7ab701c6..eb2ed0701495 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -44,6 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, u64 t64; struct MFT_REC *rec; struct runs_tree *run; + struct timespec64 ctime; inode->i_op = NULL; /* Setup 'uid' and 'gid' */ @@ -169,7 +170,8 @@ next_attr: nt2kernel(std5->cr_time, &ni->i_crtime); #endif nt2kernel(std5->a_time, &inode->i_atime); - nt2kernel(std5->c_time, &inode->i_ctime); + ctime = inode_get_ctime(inode); + nt2kernel(std5->c_time, &ctime); nt2kernel(std5->m_time, &inode->i_mtime); ni->std_fa = std5->fa; @@ -554,7 +556,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; u8 cluster_bits = sbi->cluster_bits; u32 block_size = sb->s_blocksize; u64 bytes, lbo, valid; @@ -569,7 +571,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, if (is_resident(ni)) { ni_lock(ni); - err = attr_data_read_resident(ni, page); + err = attr_data_read_resident(ni, &folio->page); ni_unlock(ni); if (!err) @@ -642,17 +644,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, */ bytes = block_size; - if (page) { + if (folio) { u32 voff = valid - vbo; bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); - set_bh_page(bh, page, off); + folio_set_bh(bh, folio, off); err = bh_read(bh, 0); if (err < 0) goto out; - zero_user_segment(page, off + voff, off + block_size); + folio_zero_segment(folio, off + voff, off + block_size); } } @@ -958,7 +960,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, if (err >= 0) { if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; dirty = true; } @@ -1658,8 +1660,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ - inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime = - dir->i_ctime = ni->i_crtime; + inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime); + dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); mark_inode_dirty(inode); @@ -1765,9 +1767,9 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) if (!err) { drop_nlink(inode); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); if (inode->i_nlink) mark_inode_dirty(inode); } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 70f8c859e0ad..ad430d50bd79 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) err = ntfs_link_inode(inode, de); if (!err) { - dir->i_ctime = dir->i_mtime = inode->i_ctime = - current_time(dir); + dir->i_mtime = inode_set_ctime_to_ts(inode, + inode_set_ctime_current(dir)); mark_inode_dirty(inode); mark_inode_dirty(dir); d_instantiate(de, inode); @@ -324,14 +324,11 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, /* Restore after failed rename failed too. */ _ntfs_bad_inode(inode); } else if (!err) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_time(dir); + simple_rename_timestamp(dir, dentry, new_dir, new_dentry); mark_inode_dirty(inode); mark_inode_dirty(dir); - if (dir != new_dir) { - new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime; + if (dir != new_dir) mark_inode_dirty(new_dir); - } if (IS_DIRSYNC(dir)) ntfs_sync_inode(dir); diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 1a02072b6b0e..5661a363005e 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -569,14 +569,10 @@ static void init_once(void *foo) } /* - * put_ntfs - Noinline to reduce binary size. + * Noinline to reduce binary size. */ -static noinline void put_ntfs(struct ntfs_sb_info *sbi) +static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) { - kfree(sbi->new_rec); - kvfree(ntfs_put_shared(sbi->upcase)); - kfree(sbi->def_table); - wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); @@ -601,6 +597,13 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi) indx_clear(&sbi->security.index_sdh); indx_clear(&sbi->reparse.index_r); indx_clear(&sbi->objid.index_o); +} + +static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) +{ + kfree(sbi->new_rec); + kvfree(ntfs_put_shared(sbi->upcase)); + kfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); @@ -625,12 +628,7 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); - - put_mount_options(sbi->options); - put_ntfs(sbi); - sb->s_fs_info = NULL; - - sync_blockdev(sb->s_bdev); + ntfs3_put_sbi(sbi); } static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1564,15 +1562,8 @@ load_root: put_inode_out: iput(inode); out: - /* - * Free resources here. - * ntfs_fs_free will be called with fc->s_fs_info = NULL - */ - put_mount_options(sbi->options); - put_ntfs(sbi); - sb->s_fs_info = NULL; + ntfs3_put_sbi(sbi); kfree(boot2); - return err; } @@ -1658,8 +1649,10 @@ static void ntfs_fs_free(struct fs_context *fc) struct ntfs_mount_options *opts = fc->fs_private; struct ntfs_sb_info *sbi = fc->s_fs_info; - if (sbi) - put_ntfs(sbi); + if (sbi) { + ntfs3_put_sbi(sbi); + ntfs3_free_sbi(sbi); + } if (opts) put_mount_options(opts); @@ -1728,13 +1721,24 @@ free_opts: return -ENOMEM; } +static void ntfs3_kill_sb(struct super_block *sb) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + + kill_block_super(sb); + + if (sbi->options) + put_mount_options(sbi->options); + ntfs3_free_sbi(sbi); +} + // clang-format off static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs3", .init_fs_context = ntfs_init_fs_context, .parameters = ntfs_fs_parameters, - .kill_sb = kill_block_super, + .kill_sb = ntfs3_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 023f314e8950..29fd391899e5 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -637,7 +637,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, if (!err) { set_cached_acl(inode, type, acl); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } @@ -924,7 +924,7 @@ set_new_fa: NULL); out: - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return err; diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 3123da7cfb30..2514d36cbe01 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,6 +2,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9fd03eaf15f8..e75137a8e7cb 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -191,10 +191,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 51c93929a146..aef58f1395c8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8dfc284e85f0..0fdba30740ab 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2048,7 +2048,7 @@ out_write_size: } inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); if (handle) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 35c05c18de59..bc27301eab6d 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -44,17 +44,17 @@ static LIST_HEAD(send_tracking); void o2net_debug_add_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&nst->st_net_debug_item, &send_tracking); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); if (!list_empty(&nst->st_net_debug_item)) list_del_init(&nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } static struct o2net_send_tracking @@ -84,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; } @@ -95,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); list_del_init(&dummy_nst->st_net_debug_item); if (nst) list_add(&dummy_nst->st_net_debug_item, &nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; /* unused, just needs to be null when done */ } @@ -112,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) ktime_t now; s64 sock, send, status; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); if (!nst) goto out; @@ -145,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) (long long)status); out: - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -191,16 +191,16 @@ static const struct file_operations nst_seq_fops = { void o2net_debug_add_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&sc->sc_net_debug_item, &sock_containers); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_del_init(&sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } struct o2net_sock_debug { @@ -236,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; } @@ -248,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); list_del_init(&dummy_sc->sc_net_debug_item); if (sc) list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; /* unused, just needs to be null when done */ } @@ -349,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); if (sc) { @@ -359,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 189c111bc371..15d0ed9c13e5 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -93,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work) int lowest_hb, lowest_reachable = 0, fence = 0; struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); if (lowest_hb != O2NM_MAX_NODES) @@ -146,14 +146,14 @@ static void o2quo_make_decision(struct work_struct *work) out: if (fence) { - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); o2quo_fence_self(); } else { mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " "connected: %d, lowest: %d (%sreachable)\n", qs->qs_heartbeating, qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } @@ -196,7 +196,7 @@ void o2quo_hb_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating++; mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, @@ -211,7 +211,7 @@ void o2quo_hb_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* hb going down releases any holds we might have had due to this node from @@ -220,7 +220,7 @@ void o2quo_hb_down(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating--; mlog_bug_on_msg(qs->qs_heartbeating < 0, @@ -233,7 +233,7 @@ void o2quo_hb_down(u8 node) o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* this tells us that we've decided that the node is still heartbeating @@ -245,14 +245,14 @@ void o2quo_hb_still_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); mlog(0, "node %u\n", node); qs->qs_pending = 1; o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* This is analogous to hb_up. as a node's connection comes up we delay the @@ -264,7 +264,7 @@ void o2quo_conn_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_connected++; mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, @@ -279,7 +279,7 @@ void o2quo_conn_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* we've decided that we won't ever be connecting to the node again. if it's @@ -290,7 +290,7 @@ void o2quo_conn_err(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); if (test_bit(node, qs->qs_conn_bm)) { qs->qs_connected--; @@ -307,7 +307,7 @@ void o2quo_conn_err(u8 node) mlog(0, "node %u, %d total\n", node, qs->qs_connected); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } void o2quo_init(void) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 694471fc46b8..8b123d543e6e 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1658,7 +1658,7 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -2962,11 +2962,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); di->i_size = cpu_to_le64(sb->s_blocksize); - di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec); ocfs2_update_inode_fsync_trans(handle, dir, 1); /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index ba26c5567cff..81265123ce6c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, parent, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c28bc983a7b1..c3e2961ee5db 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2162,6 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ctime = inode_get_ctime(inode); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2185,7 +2186,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + cpu_to_be64(ocfs2_pack_timespec(&ctime)); lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); @@ -2208,6 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ctime; mlog_meta_lvb(0, lockres); @@ -2238,8 +2240,9 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) be64_to_cpu(lvb->lvb_iatime_packed)); ocfs2_unpack_timespec(&inode->i_mtime, be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, + ocfs2_unpack_timespec(&ctime, be64_to_cpu(lvb->lvb_ictime_packed)); + inode_set_ctime_to_ts(inode, ctime); spin_unlock(&oi->ip_lock); return 0; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bf2c17ea96a0..c45596c25c66 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -232,8 +232,10 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { + struct timespec64 ctime = inode_get_ctime(inode); + if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + (timespec64_compare(&inode->i_atime, &ctime) <= 0)) return 1; return 0; @@ -294,7 +296,7 @@ int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -415,12 +417,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = cpu_to_le64(new_i_size); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -808,12 +810,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! */ - ret = block_commit_write(page, block_start + 1, - block_start + 1); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + block_commit_write(page, block_start + 1, block_start + 1); } /* @@ -824,7 +821,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); di->i_mtime_nsec = di->i_ctime_nsec; @@ -1317,7 +1314,7 @@ int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, goto bail; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -2043,7 +2040,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bb116c39b581..e8771600b930 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -306,8 +306,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, @@ -1314,8 +1314,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mode = cpu_to_le16(inode->i_mode); fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); @@ -1352,8 +1352,8 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); spin_unlock(&OCFS2_I(inode)->ip_lock); } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 25d8072ccfce..ce215565d061 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -114,9 +114,9 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) if (osb->replay_map) return 0; - replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + - (osb->max_slots * sizeof(char)), GFP_KERNEL); - + replay_map = kzalloc(struct_size(replay_map, rm_replay_slots, + osb->max_slots), + GFP_KERNEL); if (!replay_map) { mlog_errno(-ENOMEM); return -ENOMEM; @@ -178,16 +178,13 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); - rm = kzalloc(sizeof(struct ocfs2_recovery_map) + - osb->max_slots * sizeof(unsigned int), + rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots), GFP_KERNEL); if (!rm) { mlog_errno(-ENOMEM); return -ENOMEM; } - rm->rm_entries = (unsigned int *)((char *)rm + - sizeof(struct ocfs2_recovery_map)); osb->recovery_map = rm; return 0; @@ -557,7 +554,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_bdev->bd_super, + ocfs2_error(bh->b_assoc_map->host->i_sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } @@ -780,14 +777,14 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) mlog_errno(status); if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - struct super_block *sb = bh->b_bdev->bd_super; mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " "Aborting transaction and journal.\n"); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(sb, "Journal already aborted.\n"); + ocfs2_abort(bh->b_assoc_map->host->i_sb, + "Journal already aborted.\n"); } } } @@ -911,9 +908,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } @@ -1687,9 +1684,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 41c382f68529..41c9fe7e62f9 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,7 +29,7 @@ struct ocfs2_dinode; struct ocfs2_recovery_map { unsigned int rm_used; - unsigned int *rm_entries; + unsigned int rm_entries[]; }; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index b1e32ec4a9d4..05d67968a3a9 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -950,9 +950,9 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 17c52225b87d..5cd6d7771cea 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -793,10 +793,10 @@ static int ocfs2_link(struct dentry *old_dentry, } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir, ocfs2_set_links_count(fe, inode->i_nlink); ocfs2_journal_dirty(handle, fe_bh); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -1535,9 +1535,13 @@ static int ocfs2_rename(struct mnt_idmap *idmap, status = ocfs2_add_entry(handle, new_dentry, old_inode, OCFS2_I(old_inode)->ip_blkno, new_dir_bh, &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } } - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), @@ -1546,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; - old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); - old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec); + old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec); ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1586,9 +1590,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, @@ -1610,7 +1614,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (old_dir != new_dir) { /* Keep the same times on both directories.*/ - new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + new_dir->i_mtime = inode_set_ctime_to_ts(new_dir, + inode_get_ctime(old_dir)); /* * This will also pick up the i_nlink change from the diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 564ab48d03ef..25c8ec3c8c3a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3750,9 +3750,9 @@ static int ocfs2_change_ctime(struct inode *inode, goto out_commit; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_journal_dirty(handle, di_bh); @@ -4073,10 +4073,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode, * we want mtime to appear identical to the source and * update ctime. */ - t_inode->i_ctime = current_time(t_inode); + inode_set_ctime_current(t_inode); - di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec); t_inode->i_mtime = s_inode->i_mtime; di->i_mtime = s_di->i_mtime; @@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest, if (newlen > i_size_read(dest)) i_size_write(dest, newlen); spin_unlock(&OCFS2_I(dest)->ip_lock); - dest->i_ctime = dest->i_mtime = current_time(dest); + dest->i_mtime = inode_set_ctime_current(dest); ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); if (ret) { diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 05d4414d0c33..9b76ee66aeb2 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -738,18 +738,11 @@ static int user_plock(struct ocfs2_cluster_connection *conn, * * Internally, fs/dlm will pass these to a misc device, which * a userspace daemon will read and write to. - * - * For now, cancel requests (which happen internally only), - * are turned into unlocks. Most of this function taken from - * gfs2_lock. */ - if (cmd == F_CANCELLK) { - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); else if (fl->fl_type == F_UNLCK) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 988d1c076861..6b906424902b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1517,8 +1517,7 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, - OCFS2_STACK_LABEL_LEN); + seq_show_option(s, "cluster_stack", osb->osb_cluster_stack); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ac77ff6e676..6510ad783c91 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3421,9 +3421,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); } out: diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig index 42b2ec35a05b..8470f6c3e64e 100644 --- a/fs/omfs/Kconfig +++ b/fs/omfs/Kconfig @@ -2,6 +2,7 @@ config OMFS_FS tristate "SonicBlue Optimized MPEG File System support" depends on BLOCK + select BUFFER_HEAD select CRC_ITU_T help This is the proprietary file system used by the Rio Karma music diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 82cf7e9a665f..6bda275826d6 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode) mark_buffer_dirty(bh); brelse(bh); - dir->i_ctime = current_time(dir); + inode_set_ctime_current(dir); /* mark affected inodes dirty to rebuild checksums */ mark_inode_dirty(dir); @@ -399,7 +399,7 @@ static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) goto out; - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); out: return err; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index de8f57ee39ec..6b580b9da8e3 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -14,7 +14,7 @@ static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) { return (sbi->s_sys_blocksize - offset - sizeof(struct omfs_extent)) / - sizeof(struct omfs_extent_entry) + 1; + sizeof(struct omfs_extent_entry); } void omfs_make_empty_table(struct buffer_head *bh, int offset) @@ -24,8 +24,8 @@ void omfs_make_empty_table(struct buffer_head *bh, int offset) oe->e_next = ~cpu_to_be64(0ULL); oe->e_extent_count = cpu_to_be32(1), oe->e_fill = cpu_to_be32(0x22), - oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); - oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); + oe->e_entry[0].e_cluster = ~cpu_to_be64(0ULL); + oe->e_entry[0].e_blocks = ~cpu_to_be64(0ULL); } int omfs_shrink_inode(struct inode *inode) @@ -68,7 +68,7 @@ int omfs_shrink_inode(struct inode *inode) last = next; next = be64_to_cpu(oe->e_next); - entry = &oe->e_entry; + entry = oe->e_entry; /* ignore last entry as it is the terminator */ for (; extent_count > 1; extent_count--) { @@ -117,7 +117,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, u64 *ret_block) { struct omfs_extent_entry *terminator; - struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_extent_entry *entry = oe->e_entry; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); u32 extent_count = be32_to_cpu(oe->e_extent_count); u64 new_block = 0; @@ -245,7 +245,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, extent_count = be32_to_cpu(oe->e_extent_count); next = be64_to_cpu(oe->e_next); - entry = &oe->e_entry; + entry = oe->e_entry; if (extent_count > max_extents) goto out_brelse; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index c4c79e07efc7..2f8c1882f45c 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; @@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait) oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); - ctime = inode->i_ctime.tv_sec * 1000LL + - ((inode->i_ctime.tv_nsec + 999)/1000); + ctime = inode_get_ctime(inode).tv_sec * 1000LL + + ((inode_get_ctime(inode).tv_nsec + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); @@ -232,10 +232,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode->i_atime.tv_sec = ctime; inode->i_mtime.tv_sec = ctime; - inode->i_ctime.tv_sec = ctime; + inode_set_ctime(inode, ctime, nsecs); inode->i_atime.tv_nsec = nsecs; inode->i_mtime.tv_nsec = nsecs; - inode->i_ctime.tv_nsec = nsecs; inode->i_mapping->a_ops = &omfs_aops; diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h index caecb3d5a344..1ff6b9e41297 100644 --- a/fs/omfs/omfs_fs.h +++ b/fs/omfs/omfs_fs.h @@ -77,7 +77,7 @@ struct omfs_extent { __be64 e_next; /* next extent table location */ __be32 e_extent_count; /* total # extents in this table */ __be32 e_fill; - struct omfs_extent_entry e_entry; /* start of extent entries */ + struct omfs_extent_entry e_entry[]; /* start of extent entries */ }; #endif diff --git a/fs/open.c b/fs/open.c index e6ead0f19964..98f6601fbac6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -671,11 +671,20 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) return err; } -static int do_fchmodat(int dfd, const char __user *filename, umode_t mode) +static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, + unsigned int flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; + unsigned int lookup_flags; + + if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) + return -EINVAL; + + lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { @@ -689,15 +698,21 @@ retry: return error; } +SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, + umode_t, mode, unsigned int, flags) +{ + return do_fchmodat(dfd, filename, mode, flags); +} + SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { - return do_fchmodat(dfd, filename, mode); + return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return do_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* @@ -1150,7 +1165,7 @@ EXPORT_SYMBOL_GPL(kernel_file_open); * backing_file_open - open a backing file for kernel internal use * @path: path of the file to open * @flags: open flags - * @path: path of the backing file + * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). @@ -1503,7 +1518,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; @@ -1520,10 +1535,18 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1533,7 +1556,20 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = close_fd(fd); + int retval; + struct file *file; + + file = close_fd_get_file(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || @@ -1546,7 +1582,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) } /** - * close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index f0b7f4d51a17..b2457cb97fa0 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -237,7 +237,7 @@ found: if (IS_ERR(inode)) return ERR_CAST(inode); if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; ent_oi->u = ent_data; @@ -387,8 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc) goto out_no_root; } - root_inode->i_mtime = root_inode->i_atime = - root_inode->i_ctime = current_time(root_inode); + root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode); root_inode->i_op = &openprom_inode_operations; root_inode->i_fop = &openprom_operations; root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 9014bbcc8031..085912268442 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -871,7 +871,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -900,12 +900,13 @@ int orangefs_permission(struct mnt_idmap *idmap, return generic_permission(&nop_mnt_idmap, inode, mask); } -int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) +int orangefs_update_time(struct inode *inode, int flags) { struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - generic_update_time(inode, time, flags); + flags = generic_update_time(inode, flags); memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 77518e248cf7..c9dfd5c6a097 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -421,7 +421,7 @@ static int orangefs_rename(struct mnt_idmap *idmap, ret); if (new_dentry->d_inode) - new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode); + inode_set_ctime_current(d_inode(new_dentry)); op_release(new_op); return ret; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index ce20d3443869..b711654ca18a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -370,7 +370,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int orangefs_update_time(struct inode *, struct timespec64 *, int); +int orangefs_update_time(struct inode *, int); /* * defined in xattr.c diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 46b7dcff18ac..0a9fcfdf552f 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -361,11 +361,11 @@ again2: downcall.resp.getattr.attributes.atime; inode->i_mtime.tv_sec = (time64_t)new_op-> downcall.resp.getattr.attributes.mtime; - inode->i_ctime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.ctime; + inode_set_ctime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.ctime, + 0); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 6708e54b0e30..fec5020c3495 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -124,3 +124,12 @@ config OVERLAY_FS_METACOPY that doesn't support this feature will have unexpected results. If unsure, say N. + +config OVERLAY_FS_DEBUG + bool "Overlayfs: turn on extra debugging checks" + default n + depends on OVERLAY_FS + help + Say Y here to enable extra debugging checks in overlayfs. + + If unsure, say N. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 568f743a5584..ada3fcc9c6d5 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry, { struct iattr attr = { .ia_valid = - ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME, .ia_atime = stat->atime, .ia_mtime = stat->mtime, }; @@ -416,7 +416,7 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, if (is_upper) fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; fh->fb.len = sizeof(fh->fb) + buflen; - if (ofs->config.uuid) + if (ovl_origin_uuid(ofs)) fh->fb.uuid = *uuid; return fh; @@ -544,6 +544,7 @@ struct ovl_copy_up_ctx { bool origin; bool indexed; bool metacopy; + bool metacopy_digest; }; static int ovl_link_up(struct ovl_copy_up_ctx *c) @@ -617,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already * copied i_flags @@ -641,8 +643,20 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) } if (c->metacopy) { - err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY, - NULL, 0, -EOPNOTSUPP); + struct path lowerdatapath; + struct ovl_metacopy metacopy_data = OVL_METACOPY_INIT; + + ovl_path_lowerdata(c->dentry, &lowerdatapath); + if (WARN_ON_ONCE(lowerdatapath.dentry == NULL)) + return -EIO; + err = ovl_get_verity_digest(ofs, &lowerdatapath, &metacopy_data); + if (err) + return err; + + if (metacopy_data.digest_algo) + c->metacopy_digest = true; + + err = ovl_set_metacopy_xattr(ofs, temp, &metacopy_data); if (err) return err; } @@ -751,9 +765,15 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (err) goto cleanup; - if (!c->metacopy) - ovl_set_upperdata(d_inode(c->dentry)); inode = d_inode(c->dentry); + if (c->metacopy_digest) + ovl_set_flag(OVL_HAS_DIGEST, inode); + else + ovl_clear_flag(OVL_HAS_DIGEST, inode); + ovl_clear_flag(OVL_VERIFIED_DIGEST, inode); + + if (!c->metacopy) + ovl_set_upperdata(inode); ovl_inode_update(inode, temp); if (S_ISDIR(inode->i_mode)) ovl_set_flag(OVL_WHITEOUTS, inode); @@ -813,6 +833,12 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) goto out_fput; + if (c->metacopy_digest) + ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); + else + ovl_clear_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); + ovl_clear_flag(OVL_VERIFIED_DIGEST, d_inode(c->dentry)); + if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); ovl_inode_update(d_inode(c->dentry), dget(temp)); @@ -907,7 +933,7 @@ out: static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, int flags) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); if (!ofs->config.metacopy) return false; @@ -918,6 +944,19 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC))) return false; + /* Fall back to full copy if no fsverity on source data and we require verity */ + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + struct path lowerdata; + + ovl_path_lowerdata(dentry, &lowerdata); + + if (WARN_ON_ONCE(lowerdata.dentry == NULL) || + ovl_ensure_verity_loaded(&lowerdata) || + !fsverity_active(d_inode(lowerdata.dentry))) { + return false; + } + } + return true; } @@ -984,6 +1023,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (err) goto out_free; + ovl_clear_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); + ovl_clear_flag(OVL_VERIFIED_DIGEST, d_inode(c->dentry)); ovl_set_upperdata(d_inode(c->dentry)); out_free: kfree(capability); @@ -1078,7 +1119,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) * not very important to optimize this case, so do lazy lowerdata lookup * before any copy up, so we can do it before taking ovl_inode_lock(). */ - err = ovl_maybe_lookup_lowerdata(dentry); + err = ovl_verify_lowerdata(dentry); if (err) return err; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 35680b6e175b..c8c8588bd98c 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -174,28 +174,37 @@ static int ovl_connect_layer(struct dentry *dentry) * U = upper file handle * L = lower file handle * - * (*) Connecting an overlay dir from real lower dentry is not always + * (*) Decoding a connected overlay dir from real lower dentry is not always * possible when there are redirects in lower layers and non-indexed merge dirs. * To mitigate those case, we may copy up the lower dir ancestor before encode - * a lower dir file handle. + * of a decodable file handle for non-upper dir. * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ static int ovl_check_encode_origin(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + bool decodable = ofs->config.nfs_export; + + /* Lower file handle for non-upper non-decodable */ + if (!ovl_dentry_upper(dentry) && !decodable) + return 0; /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) return 0; /* - * Upper file handle for non-indexed upper. - * * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (ovl_dentry_upper(dentry) && + if (dentry == dentry->d_sb->s_root) + return 0; + + /* + * Upper decodable file handle for non-indexed upper. + */ + if (ovl_dentry_upper(dentry) && decodable && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) return 0; @@ -205,7 +214,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && ovl_upper_mnt(ofs)) + if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable) return ovl_connect_layer(dentry); /* Lower file handle for indexed and non-upper dir/non-dir */ @@ -435,7 +444,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; @@ -656,7 +665,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, struct ovl_path *lowerpath, struct dentry *index) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); @@ -681,7 +690,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct dentry *dentry; struct dentry *upper; @@ -701,7 +710,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *dentry = NULL; @@ -876,3 +885,8 @@ const struct export_operations ovl_export_operations = { .get_name = ovl_get_name, .get_parent = ovl_get_parent, }; + +/* encode_fh() encodes non-decodable file handles with nfs_export=off */ +const struct export_operations ovl_export_fid_operations = { + .encode_fh = ovl_encode_fh, +}; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 21245b00722a..693971d20280 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -19,7 +19,6 @@ struct ovl_aio_req { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; - struct fd fd; }; static struct kmem_cache *ovl_aio_request_cachep; @@ -115,8 +114,8 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real, if (allow_meta) { ovl_path_real(dentry, &realpath); } else { - /* lazy lookup of lowerdata */ - err = ovl_maybe_lookup_lowerdata(dentry); + /* lazy lookup and verify of lowerdata */ + err = ovl_verify_lowerdata(dentry); if (err) return err; @@ -159,8 +158,8 @@ static int ovl_open(struct inode *inode, struct file *file) struct path realpath; int err; - /* lazy lookup of lowerdata */ - err = ovl_maybe_lookup_lowerdata(dentry); + /* lazy lookup and verify lowerdata */ + err = ovl_verify_lowerdata(dentry); if (err) return err; @@ -239,6 +238,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; + struct timespec64 ctime, uctime; if (file->f_flags & O_NOATIME) return; @@ -249,10 +249,12 @@ static void ovl_file_accessed(struct file *file) if (!upperinode) return; + ctime = inode_get_ctime(inode); + uctime = inode_get_ctime(upperinode); if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || - !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) { + !timespec64_equal(&ctime, &uctime))) { inode->i_mtime = upperinode->i_mtime; - inode->i_ctime = upperinode->i_ctime; + inode_set_ctime_to_ts(inode, uctime); } touch_atime(&file->f_path); @@ -277,7 +279,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl) static inline void ovl_aio_put(struct ovl_aio_req *aio_req) { if (refcount_dec_and_test(&aio_req->ref)) { - fdput(aio_req->fd); + fput(aio_req->iocb.ki_filp); kmem_cache_free(ovl_aio_request_cachep, aio_req); } } @@ -290,10 +292,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) if (iocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(orig_iocb->ki_filp); - /* Actually acquired in ovl_write_iter() */ - __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, - SB_FREEZE_WRITE); - file_end_write(iocb->ki_filp); + kiocb_end_write(iocb); ovl_copyattr(inode); } @@ -342,10 +341,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); @@ -393,6 +391,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + /* + * Overlayfs doesn't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + ifl &= ~IOCB_DIO_CALLER_COMP; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { file_start_write(real.file); @@ -409,17 +413,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - file_start_write(real.file); - /* Pacify lockdep, same trick as done in aio_write() */ - __sb_writers_release(file_inode(real.file)->i_sb, - SB_FREEZE_WRITE); - aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); + kiocb_start_write(&aio_req->iocb); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index a63e57447be9..83ef66644c21 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -341,7 +341,7 @@ static const char *ovl_get_link(struct dentry *dentry, bool ovl_is_private_xattr(struct super_block *sb, const char *name) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); if (ofs->config.userxattr) return strncmp(name, OVL_XATTR_USER_PREFIX, @@ -693,10 +693,10 @@ int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } #endif -int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) +int ovl_update_time(struct inode *inode, int flags) { if (flags & S_ATIME) { - struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(inode->i_sb); struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = ovl_upperdentry_dereference(OVL_I(inode)), @@ -1291,7 +1291,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, struct dentry *lower, bool index) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); /* No, if pure upper */ if (!lower) @@ -1311,7 +1311,7 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, return false; /* No, if non-indexed upper with NFS export */ - if (sb->s_export_op && upper) + if (ofs->config.nfs_export && upper) return false; /* Otherwise, hash by lower inode for fsnotify */ diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 57adf911735f..80391c687c2a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -25,7 +25,7 @@ struct ovl_lookup_data { bool stop; bool last; char *redirect; - bool metacopy; + int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; }; @@ -171,8 +171,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, * layer where file handle will be decoded. * In case of uuid=off option just make sure that stored uuid is null. */ - if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : - !uuid_is_null(&fh->fb.uuid)) + if (ovl_origin_uuid(ofs) ? + !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : + !uuid_is_null(&fh->fb.uuid)) return NULL; bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); @@ -270,7 +271,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path); + err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); if (err < 0) goto out_err; @@ -889,8 +890,58 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, return err; } +static int ovl_maybe_validate_verity(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct inode *inode = d_inode(dentry); + struct path datapath, metapath; + int err; + + if (!ofs->config.verity_mode || + !ovl_is_metacopy_dentry(dentry) || + ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) + return 0; + + if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) { + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", + dentry); + return -EIO; + } + return 0; + } + + ovl_path_lowerdata(dentry, &datapath); + if (!datapath.dentry) + return -EIO; + + ovl_path_real(dentry, &metapath); + if (!metapath.dentry) + return -EIO; + + err = ovl_inode_lock_interruptible(inode); + if (err) + return err; + + if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + + err = ovl_validate_verity(ofs, &metapath, &datapath); + if (err == 0) + ovl_set_flag(OVL_VERIFIED_DIGEST, inode); + + revert_creds(old_cred); + } + + ovl_inode_unlock(inode); + + return err; +} + /* Lazy lookup of lowerdata */ -int ovl_maybe_lookup_lowerdata(struct dentry *dentry) +static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) { struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); @@ -935,12 +986,23 @@ out_err: goto out; } +int ovl_verify_lowerdata(struct dentry *dentry) +{ + int err; + + err = ovl_maybe_lookup_lowerdata(dentry); + if (err) + return err; + + return ovl_maybe_validate_verity(dentry); +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = NULL; const struct cred *old_cred; - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); struct ovl_path *stack = NULL, *origin_path = NULL; @@ -955,6 +1017,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int i; int err; bool uppermetacopy = false; + int metacopy_size = 0; struct ovl_lookup_data d = { .sb = dentry->d_sb, .name = dentry->d_name, @@ -963,7 +1026,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .stop = false, .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), .redirect = NULL, - .metacopy = false, + .metacopy = 0, }; if (dentry->d_name.len > ofs->namelen) @@ -999,6 +1062,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.metacopy) uppermetacopy = true; + metacopy_size = d.metacopy; } if (d.redirect) { @@ -1076,6 +1140,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, origin = this; } + if (!upperdentry && !d.is_dir && !ctr && d.metacopy) + metacopy_size = d.metacopy; + if (d.metacopy && ctr) { /* * Do not store intermediate metacopy dentries in @@ -1120,7 +1187,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, /* Defer lookup of lowerdata in data-only layers to first access */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { - d.metacopy = false; + d.metacopy = 0; ctr++; } @@ -1211,10 +1278,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperredirect = NULL; goto out_free_oe; } - err = ovl_check_metacopy_xattr(ofs, &upperpath); + err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) goto out_free_oe; uppermetacopy = err; + metacopy_size = err; } if (upperdentry || ctr) { @@ -1236,6 +1304,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_free_oe; if (upperdentry && !uppermetacopy) ovl_set_flag(OVL_UPPERDATA, inode); + + if (metacopy_size > OVL_METACOPY_MIN_SIZE) + ovl_set_flag(OVL_HAS_DIGEST, inode); } ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 9402591f12aa..9817b2dcb132 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/fs.h> +#include <linux/fsverity.h> #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> @@ -36,6 +37,7 @@ enum ovl_xattr { OVL_XATTR_IMPURE, OVL_XATTR_NLINK, OVL_XATTR_UPPER, + OVL_XATTR_UUID, OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, }; @@ -49,6 +51,8 @@ enum ovl_inode_flag { OVL_UPPERDATA, /* Inode number will remain constant over copy up. */ OVL_CONST_INO, + OVL_HAS_DIGEST, + OVL_VERIFIED_DIGEST, }; enum ovl_entry_flag { @@ -65,11 +69,24 @@ enum { }; enum { + OVL_UUID_OFF, + OVL_UUID_NULL, + OVL_UUID_AUTO, + OVL_UUID_ON, +}; + +enum { OVL_XINO_OFF, OVL_XINO_AUTO, OVL_XINO_ON, }; +enum { + OVL_VERITY_OFF, + OVL_VERITY_ON, + OVL_VERITY_REQUIRE, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -126,6 +143,26 @@ struct ovl_fh { #define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ offsetof(struct ovl_fb, fid)) +/* On-disk format for "metacopy" xattr (if non-zero size) */ +struct ovl_metacopy { + u8 version; /* 0 */ + u8 len; /* size of this header + used digest bytes */ + u8 flags; + u8 digest_algo; /* FS_VERITY_HASH_ALG_* constant, 0 for no digest */ + u8 digest[FS_VERITY_MAX_DIGEST_SIZE]; /* Only the used part on disk */ +} __packed; + +#define OVL_METACOPY_MAX_SIZE (sizeof(struct ovl_metacopy)) +#define OVL_METACOPY_MIN_SIZE (OVL_METACOPY_MAX_SIZE - FS_VERITY_MAX_DIGEST_SIZE) +#define OVL_METACOPY_INIT { 0, OVL_METACOPY_MIN_SIZE } + +static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy) +{ + if (metacopy->len < OVL_METACOPY_MIN_SIZE) + return 0; + return (int)metacopy->len - OVL_METACOPY_MIN_SIZE; +} + extern const char *const ovl_xattr_table[][2]; static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) { @@ -430,6 +467,8 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags); bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, + const struct path *upperpath); static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) @@ -452,9 +491,20 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, + struct ovl_metacopy *data); +int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, + struct ovl_metacopy *metacopy); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); +int ovl_ensure_verity_loaded(struct path *path); +int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, + u8 *digest_buf, int *buf_length); +int ovl_validate_verity(struct ovl_fs *ofs, + struct path *metapath, + struct path *datapath); +int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + struct ovl_metacopy *metacopy); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) @@ -494,6 +544,17 @@ static inline bool ovl_redirect_dir(struct ovl_fs *ofs) return ofs->config.redirect_mode == OVL_REDIRECT_ON; } +static inline bool ovl_origin_uuid(struct ovl_fs *ofs) +{ + return ofs->config.uuid != OVL_UUID_OFF; +} + +static inline bool ovl_has_fsid(struct ovl_fs *ofs) +{ + return ofs->config.uuid == OVL_UUID_ON || + ofs->config.uuid == OVL_UUID_AUTO; +} + /* * With xino=auto, we do best effort to keep all inodes on same st_dev and * d_ino consistent with st_ino. @@ -574,7 +635,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); -int ovl_maybe_lookup_lowerdata(struct dentry *dentry); +int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); @@ -665,7 +726,7 @@ static inline struct posix_acl *ovl_get_acl_path(const struct path *path, } #endif -int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); +int ovl_update_time(struct inode *inode, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); struct ovl_inode_params { @@ -759,6 +820,7 @@ int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, /* export.c */ extern const struct export_operations ovl_export_operations; +extern const struct export_operations ovl_export_fid_operations; /* super.c */ int ovl_fill_super(struct super_block *sb, struct fs_context *fc); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 306e1ecdc96d..e9539f98e86a 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -10,8 +10,9 @@ struct ovl_config { char *workdir; bool default_permissions; int redirect_mode; + int verity_mode; bool index; - bool uuid; + int uuid; bool nfs_export; int xino; bool metacopy; @@ -81,6 +82,7 @@ struct ovl_fs { const struct cred *creator_cred; bool tmpfile; bool noxattr; + bool nofh; /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; @@ -115,8 +117,13 @@ static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs) return mnt_idmap(ovl_upper_mnt(ofs)); } +extern struct file_system_type ovl_fs_type; + static inline struct ovl_fs *OVL_FS(struct super_block *sb) { + if (IS_ENABLED(CONFIG_OVERLAY_FS_DEBUG)) + WARN_ON_ONCE(sb->s_type != &ovl_fs_type); + return (struct ovl_fs *)sb->s_fs_info; } diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index a63160dbb0f9..b9355bb6d75a 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -55,6 +55,7 @@ enum { Opt_userxattr, Opt_xino, Opt_metacopy, + Opt_verity, Opt_volatile, }; @@ -64,6 +65,24 @@ static const struct constant_table ovl_parameter_bool[] = { {} }; +static const struct constant_table ovl_parameter_uuid[] = { + { "off", OVL_UUID_OFF }, + { "null", OVL_UUID_NULL }, + { "auto", OVL_UUID_AUTO }, + { "on", OVL_UUID_ON }, + {} +}; + +static const char *ovl_uuid_mode(struct ovl_config *config) +{ + return ovl_parameter_uuid[config->uuid].name; +} + +static int ovl_uuid_def(void) +{ + return OVL_UUID_AUTO; +} + static const struct constant_table ovl_parameter_xino[] = { { "off", OVL_XINO_OFF }, { "auto", OVL_XINO_AUTO }, @@ -101,6 +120,23 @@ static int ovl_redirect_mode_def(void) OVL_REDIRECT_NOFOLLOW; } +static const struct constant_table ovl_parameter_verity[] = { + { "off", OVL_VERITY_OFF }, + { "on", OVL_VERITY_ON }, + { "require", OVL_VERITY_REQUIRE }, + {} +}; + +static const char *ovl_verity_mode(struct ovl_config *config) +{ + return ovl_parameter_verity[config->verity_mode].name; +} + +static int ovl_verity_mode_def(void) +{ + return OVL_VERITY_OFF; +} + #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) @@ -111,11 +147,12 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_flag("default_permissions", Opt_default_permissions), fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), fsparam_enum("index", Opt_index, ovl_parameter_bool), - fsparam_enum("uuid", Opt_uuid, ovl_parameter_bool), + fsparam_enum("uuid", Opt_uuid, ovl_parameter_uuid), fsparam_enum("nfs_export", Opt_nfs_export, ovl_parameter_bool), fsparam_flag("userxattr", Opt_userxattr), fsparam_enum("xino", Opt_xino, ovl_parameter_xino), fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), + fsparam_enum("verity", Opt_verity, ovl_parameter_verity), fsparam_flag("volatile", Opt_volatile), {} }; @@ -572,6 +609,9 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) config->metacopy = result.uint_32; ctx->set.metacopy = true; break; + case Opt_verity: + config->verity_mode = result.uint_32; + break; case Opt_volatile: config->ovl_volatile = true; break; @@ -622,7 +662,7 @@ static void ovl_free(struct fs_context *fc) static int ovl_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct super_block *upper_sb; int ret = 0; @@ -679,7 +719,7 @@ int ovl_init_fs_context(struct fs_context *fc) ofs->config.redirect_mode = ovl_redirect_mode_def(); ofs->config.index = ovl_index_def; - ofs->config.uuid = true; + ofs->config.uuid = ovl_uuid_def(); ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); ofs->config.metacopy = ovl_metacopy_def; @@ -762,6 +802,23 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->ovl_volatile = false; } + if (!config->upperdir && config->uuid == OVL_UUID_ON) { + pr_info("option \"uuid=on\" requires an upper fs, falling back to uuid=null.\n"); + config->uuid = OVL_UUID_NULL; + } + + /* Resolve verity -> metacopy dependency */ + if (config->verity_mode && !config->metacopy) { + /* Don't allow explicit specified conflicting combinations */ + if (set.metacopy) { + pr_err("conflicting options: metacopy=off,verity=%s\n", + ovl_verity_mode(config)); + return -EINVAL; + } + /* Otherwise automatically enable metacopy. */ + config->metacopy = true; + } + /* * This is to make the logic below simpler. It doesn't make any other * difference, since redirect_dir=on is only used for upper. @@ -769,13 +826,18 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW) config->redirect_mode = OVL_REDIRECT_ON; - /* Resolve metacopy -> redirect_dir dependency */ + /* Resolve verity -> metacopy -> redirect_dir dependency */ if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) { if (set.metacopy && set.redirect) { pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", ovl_redirect_mode(config)); return -EINVAL; } + if (config->verity_mode && set.redirect) { + pr_err("conflicting options: verity=%s,redirect_dir=%s\n", + ovl_verity_mode(config), ovl_redirect_mode(config)); + return -EINVAL; + } if (set.redirect) { /* * There was an explicit redirect_dir=... that resulted @@ -812,7 +874,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, } } - /* Resolve nfs_export -> !metacopy dependency */ + /* Resolve nfs_export -> !metacopy && !verity dependency */ if (config->nfs_export && config->metacopy) { if (set.nfs_export && set.metacopy) { pr_err("conflicting options: nfs_export=on,metacopy=on\n"); @@ -825,6 +887,14 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, */ pr_info("disabling nfs_export due to metacopy=on\n"); config->nfs_export = false; + } else if (config->verity_mode) { + /* + * There was an explicit verity=.. that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to verity=%s\n", + ovl_verity_mode(config)); + config->nfs_export = false; } else { /* * There was an explicit nfs_export=on that resulted @@ -836,7 +906,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, } - /* Resolve userxattr -> !redirect && !metacopy dependency */ + /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */ if (config->userxattr) { if (set.redirect && config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { @@ -848,6 +918,11 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, pr_err("conflicting options: userxattr,metacopy=on\n"); return -EINVAL; } + if (config->verity_mode) { + pr_err("conflicting options: userxattr,verity=%s\n", + ovl_verity_mode(config)); + return -EINVAL; + } /* * Silently disable default setting of redirect and metacopy. * This shall be the default in the future as well: these @@ -872,7 +947,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower]; @@ -895,8 +970,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) ovl_redirect_mode(&ofs->config)); if (ofs->config.index != ovl_index_def) seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); - if (!ofs->config.uuid) - seq_puts(m, ",uuid=off"); + if (ofs->config.uuid != ovl_uuid_def()) + seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config)); if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); @@ -909,5 +984,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_puts(m, ",volatile"); if (ofs->config.userxattr) seq_puts(m, ",userxattr"); + if (ofs->config.verity_mode != ovl_verity_mode_def()) + seq_printf(m, ",verity=%s", + ovl_verity_mode(&ofs->config)); return 0; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index cc8977498c48..def266b5e2a3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -32,6 +32,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode) { struct dentry *real = NULL, *lower; + int err; /* It's an overlay file */ if (inode && d_inode(dentry) == inode) @@ -58,7 +59,9 @@ static struct dentry *ovl_d_real(struct dentry *dentry, * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. */ - ovl_maybe_lookup_lowerdata(dentry); + err = ovl_verify_lowerdata(dentry); + if (err) + goto bug; lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; @@ -182,7 +185,7 @@ static void ovl_destroy_inode(struct inode *inode) static void ovl_put_super(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); if (ofs) ovl_free_fs(ofs); @@ -191,7 +194,7 @@ static void ovl_put_super(struct super_block *sb) /* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct super_block *upper_sb; int ret; @@ -239,8 +242,9 @@ static int ovl_sync_fs(struct super_block *sb, int wait) */ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - struct dentry *root_dentry = dentry->d_sb->s_root; + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ofs = OVL_FS(sb); + struct dentry *root_dentry = sb->s_root; struct path path; int err; @@ -250,6 +254,8 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) if (!err) { buf->f_namelen = ofs->namelen; buf->f_type = OVERLAYFS_SUPER_MAGIC; + if (ovl_has_fsid(ofs)) + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); } return err; @@ -397,6 +403,7 @@ static int ovl_lower_dir(const char *name, struct path *path, pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } + ofs->nofh |= !fh_type; /* * Decoding origin file handle is required for persistent st_ino. * Without persistent st_ino, xino=auto falls back to xino=off. @@ -770,6 +777,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->config.index = false; pr_warn("...falling back to index=off.\n"); } + if (ovl_has_fsid(ofs)) { + ofs->config.uuid = OVL_UUID_NULL; + pr_warn("...falling back to uuid=null.\n"); + } /* * xattr support is required for persistent st_ino. * Without persistent st_ino, xino=auto falls back to xino=off. @@ -815,6 +826,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->config.index = false; pr_warn("upper fs does not support file handles, falling back to index=off.\n"); } + ofs->nofh |= !fh_type; /* Check if upper fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) @@ -1416,9 +1428,12 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (!ovl_upper_mnt(ofs)) sb->s_flags |= SB_RDONLY; - if (!ofs->config.uuid && ofs->numfs > 1) { - pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n"); - ofs->config.uuid = true; + if (!ovl_origin_uuid(ofs) && ofs->numfs > 1) { + pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=null.\n"); + ofs->config.uuid = OVL_UUID_NULL; + } else if (ovl_has_fsid(ofs) && ovl_upper_mnt(ofs)) { + /* Use per instance persistent uuid/fsid */ + ovl_init_uuid_xattr(sb, ofs, &ctx->upper); } if (!ovl_force_readonly(ofs) && ofs->config.index) { @@ -1449,8 +1464,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) ofs->config.nfs_export = false; } + /* + * Support encoding decodable file handles with nfs_export=on + * and encoding non-decodable file handles with nfs_export=off + * if all layers support file handles. + */ if (ofs->config.nfs_export) sb->s_export_op = &ovl_export_operations; + else if (!ofs->nofh) + sb->s_export_op = &ovl_export_fid_operations; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); @@ -1479,7 +1501,7 @@ out_err: return err; } -static struct file_system_type ovl_fs_type = { +struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, .name = "overlay", .init_fs_context = ovl_init_fs_context, diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7ef9e13c404a..89e0d60d35b6 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -10,6 +10,7 @@ #include <linux/cred.h> #include <linux/xattr.h> #include <linux/exportfs.h> +#include <linux/file.h> #include <linux/fileattr.h> #include <linux/uuid.h> #include <linux/namei.h> @@ -18,25 +19,25 @@ int ovl_want_write(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return mnt_want_write(ovl_upper_mnt(ofs)); } void ovl_drop_write(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); mnt_drop_write(ovl_upper_mnt(ofs)); } struct dentry *ovl_workdir(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return ofs->workdir; } const struct cred *ovl_override_creds(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return override_creds(ofs->creator_cred); } @@ -62,7 +63,7 @@ int ovl_can_decode_fh(struct super_block *sb) struct dentry *ovl_indexdir(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return ofs->indexdir; } @@ -70,7 +71,7 @@ struct dentry *ovl_indexdir(struct super_block *sb) /* Index all files on copy up. For now only enabled for NFS export */ bool ovl_index_all(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.nfs_export && ofs->config.index; } @@ -78,7 +79,7 @@ bool ovl_index_all(struct super_block *sb) /* Verify lower origin on lookup. For now only enabled for NFS export */ bool ovl_verify_lower(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.nfs_export && ofs->config.index; } @@ -203,7 +204,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) void ovl_path_upper(struct dentry *dentry, struct path *path) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); path->mnt = ovl_upper_mnt(ofs); path->dentry = ovl_dentry_upper(dentry); @@ -675,6 +676,65 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) return false; } +/* + * Load persistent uuid from xattr into s_uuid if found, or store a new + * random generated value in s_uuid and in xattr. + */ +bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, + const struct path *upperpath) +{ + bool set = false; + int res; + + /* Try to load existing persistent uuid */ + res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b, + UUID_SIZE); + if (res == UUID_SIZE) + return true; + + if (res != -ENODATA) + goto fail; + + /* + * With uuid=auto, if uuid xattr is found, it will be used. + * If uuid xattrs is not found, generate a persistent uuid only on mount + * of new overlays where upper root dir is not yet marked as impure. + * An upper dir is marked as impure on copy up or lookup of its subdirs. + */ + if (ofs->config.uuid == OVL_UUID_AUTO) { + res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_IMPURE, NULL, + 0); + if (res > 0) { + /* Any mount of old overlay - downgrade to uuid=null */ + ofs->config.uuid = OVL_UUID_NULL; + return true; + } else if (res == -ENODATA) { + /* First mount of new overlay - upgrade to uuid=on */ + ofs->config.uuid = OVL_UUID_ON; + } else if (res < 0) { + goto fail; + } + + } + + /* Generate overlay instance uuid */ + uuid_gen(&sb->s_uuid); + + /* Try to store persistent uuid */ + set = true; + res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b, + UUID_SIZE); + if (res == 0) + return true; + +fail: + memset(sb->s_uuid.b, 0, UUID_SIZE); + ofs->config.uuid = OVL_UUID_NULL; + pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n", + set ? "set" : "get", upperpath->dentry, res); + return false; +} + bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox) { @@ -697,6 +757,7 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_IMPURE_POSTFIX "impure" #define OVL_XATTR_NLINK_POSTFIX "nlink" #define OVL_XATTR_UPPER_POSTFIX "upper" +#define OVL_XATTR_UUID_POSTFIX "uuid" #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" @@ -711,6 +772,7 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE), OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID), OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), }; @@ -1054,8 +1116,12 @@ err: return -EIO; } -/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) +/* + * err < 0, 0 if no metacopy xattr, metacopy data size if xattr found. + * an empty xattr returns OVL_METACOPY_MIN_SIZE to distinguish from no xattr value. + */ +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, + struct ovl_metacopy *data) { int res; @@ -1063,7 +1129,8 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) if (!S_ISREG(d_inode(path->dentry)->i_mode)) return 0; - res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, + data, data ? OVL_METACOPY_MAX_SIZE : 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; @@ -1077,12 +1144,48 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) goto out; } - return 1; + if (res == 0) { + /* Emulate empty data for zero size metacopy xattr */ + res = OVL_METACOPY_MIN_SIZE; + if (data) { + memset(data, 0, res); + data->len = res; + } + } else if (res < OVL_METACOPY_MIN_SIZE) { + pr_warn_ratelimited("metacopy file '%pd' has too small xattr\n", + path->dentry); + return -EIO; + } else if (data) { + if (data->version != 0) { + pr_warn_ratelimited("metacopy file '%pd' has unsupported version\n", + path->dentry); + return -EIO; + } + if (res != data->len) { + pr_warn_ratelimited("metacopy file '%pd' has invalid xattr size\n", + path->dentry); + return -EIO; + } + } + + return res; out: pr_warn_ratelimited("failed to get metacopy (%i)\n", res); return res; } +int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy) +{ + size_t len = metacopy->len; + + /* If no flags or digest fall back to empty metacopy file */ + if (metacopy->version == 0 && metacopy->flags == 0 && metacopy->digest_algo == 0) + len = 0; + + return ovl_check_setxattr(ofs, d, OVL_XATTR_METACOPY, + metacopy, len, -EOPNOTSUPP); +} + bool ovl_is_metacopy_dentry(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); @@ -1145,6 +1248,112 @@ err_free: return ERR_PTR(res); } +/* Call with mounter creds as it may open the file */ +int ovl_ensure_verity_loaded(struct path *datapath) +{ + struct inode *inode = d_inode(datapath->dentry); + struct file *filp; + + if (!fsverity_active(inode) && IS_VERITY(inode)) { + /* + * If this inode was not yet opened, the verity info hasn't been + * loaded yet, so we need to do that here to force it into memory. + */ + filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred()); + if (IS_ERR(filp)) + return PTR_ERR(filp); + fput(filp); + } + + return 0; +} + +int ovl_validate_verity(struct ovl_fs *ofs, + struct path *metapath, + struct path *datapath) +{ + struct ovl_metacopy metacopy_data; + u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE]; + int xattr_digest_size, digest_size; + int xattr_size, err; + u8 verity_algo; + + if (!ofs->config.verity_mode || + /* Verity only works on regular files */ + !S_ISREG(d_inode(metapath->dentry)->i_mode)) + return 0; + + xattr_size = ovl_check_metacopy_xattr(ofs, metapath, &metacopy_data); + if (xattr_size < 0) + return xattr_size; + + if (!xattr_size || !metacopy_data.digest_algo) { + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", + metapath->dentry); + return -EIO; + } + return 0; + } + + xattr_digest_size = ovl_metadata_digest_size(&metacopy_data); + + err = ovl_ensure_verity_loaded(datapath); + if (err < 0) { + pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n", + datapath->dentry); + return -EIO; + } + + digest_size = fsverity_get_digest(d_inode(datapath->dentry), actual_digest, + &verity_algo, NULL); + if (digest_size == 0) { + pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", datapath->dentry); + return -EIO; + } + + if (xattr_digest_size != digest_size || + metacopy_data.digest_algo != verity_algo || + memcmp(metacopy_data.digest, actual_digest, xattr_digest_size) != 0) { + pr_warn_ratelimited("lower file '%pd' has the wrong fs-verity digest\n", + datapath->dentry); + return -EIO; + } + + return 0; +} + +int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + struct ovl_metacopy *metacopy) +{ + int err, digest_size; + + if (!ofs->config.verity_mode || !S_ISREG(d_inode(src->dentry)->i_mode)) + return 0; + + err = ovl_ensure_verity_loaded(src); + if (err < 0) { + pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n", + src->dentry); + return -EIO; + } + + digest_size = fsverity_get_digest(d_inode(src->dentry), + metacopy->digest, &metacopy->digest_algo, NULL); + if (digest_size == 0 || + WARN_ON_ONCE(digest_size > FS_VERITY_MAX_DIGEST_SIZE)) { + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", + src->dentry); + return -EIO; + } + return 0; + } + + metacopy->len += digest_size; + return 0; +} + /* * ovl_sync_status() - Check fs sync status for volatile mounts * @@ -1202,6 +1411,6 @@ void ovl_copyattr(struct inode *inode) inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; - inode->i_ctime = realinode->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); } diff --git a/fs/pipe.c b/fs/pipe.c index 2d88f73f585a..139190165a1c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -489,7 +489,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; @@ -537,7 +537,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) break; } ret += copied; - buf->offset = 0; buf->len = copied; if (!iov_iter_count(from)) @@ -899,7 +898,7 @@ static struct inode * get_pipe_inode(void) inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); return inode; @@ -1236,7 +1235,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned long size) +unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; @@ -1319,7 +1318,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; @@ -1387,7 +1386,7 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) return pipe; } -long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 7fa1b738bbab..a05fe94970ce 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1027,7 +1027,7 @@ int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return error; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); set_cached_acl(inode, type, acl); diff --git a/fs/proc/array.c b/fs/proc/array.c index d35bbf35a874..2c2efbe685d8 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -431,6 +431,11 @@ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); } +__weak void arch_proc_pid_thread_features(struct seq_file *m, + struct task_struct *task) +{ +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -455,6 +460,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); + arch_proc_pid_thread_features(m, task); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 9df3f4839662..ffd54617c354 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb, ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_op = &proc_def_inode_operations; /* @@ -1966,7 +1966,7 @@ int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); @@ -3583,7 +3584,8 @@ static int proc_tid_comm_permission(struct mnt_idmap *idmap, } static const struct inode_operations proc_tid_comm_inode_operations = { - .permission = proc_tid_comm_permission, + .setattr = proc_setattr, + .permission = proc_tid_comm_permission, }; /* @@ -3813,11 +3815,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - pos = task = task->group_leader; - do { + for_each_thread(task, pos) { if (!nr--) goto found; - } while_each_thread(task, pos); + }; fail: pos = NULL; goto out; @@ -3899,7 +3900,7 @@ static int proc_task_getattr(struct mnt_idmap *idmap, { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index b3140deebbbf..6276b3938842 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -352,7 +352,7 @@ static int proc_fd_getattr(struct mnt_idmap *idmap, struct inode *inode = d_inode(path->dentry); int rv = 0; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* If it's a directory, put the number of open fds there */ if (S_ISDIR(inode->i_mode)) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 42ae38ff6e7e..775ce0bcf08c 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -146,7 +146,7 @@ static int proc_getattr(struct mnt_idmap *idmap, } } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 67b09a1d9433..532dc9d240f7 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_private = de->data; inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..9a8f32f21ff5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -289,9 +289,7 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; -#ifdef CONFIG_MMU struct vma_iterator iter; -#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8dca4d6d96c7..45af9a989d40 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,6 +17,7 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#include <linux/zswap.h> #include <asm/page.h> #include "internal.h" @@ -132,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); -#ifdef CONFIG_MEMTEST - if (early_memtest_done) { - unsigned long early_memtest_bad_size_kb; - - early_memtest_bad_size_kb = early_memtest_bad_size>>10; - if (early_memtest_bad_size && !early_memtest_bad_size_kb) - early_memtest_bad_size_kb = 1; - /* When 0 is reported, it means there actually was a successful test */ - seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); - } -#endif + memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a0c0419872e3..2ba31b6d68c0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap, net = get_proc_task_net(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; @@ -321,6 +321,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, + .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5ea42653126e..c88854df0b62 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,8 +19,9 @@ #include <linux/kmemleak.h> #include "internal.h" -#define list_for_each_table_entry(entry, table) \ - for ((entry) = (table); (entry)->procname; (entry)++) +#define list_for_each_table_entry(entry, header) \ + entry = header->ctl_table; \ + for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; @@ -43,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = { */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { - return register_sysctl(path, sysctl_mount_point); + return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); @@ -188,9 +189,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, - struct ctl_node *node, struct ctl_table *table) + struct ctl_node *node, struct ctl_table *table, size_t table_size) { head->ctl_table = table; + head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; @@ -204,7 +206,7 @@ static void init_header(struct ctl_table_header *head, if (node) { struct ctl_table *entry; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { node->header = head; node++; } @@ -215,7 +217,7 @@ static void erase_header(struct ctl_table_header *head) { struct ctl_table *entry; - list_for_each_table_entry(entry, head->ctl_table) + list_for_each_table_entry(entry, head) erase_entry(head, entry); } @@ -242,7 +244,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) err = insert_links(header); if (err) goto fail_links; - list_for_each_table_entry(entry, header->ctl_table) { + list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; @@ -463,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, head->count++; spin_unlock(&sysctl_lock); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; @@ -849,7 +851,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; @@ -973,7 +975,7 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; - init_header(&new->header, set->dir.header.root, set, node, table); + init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } @@ -1125,11 +1127,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) return err; } -static int sysctl_check_table(const char *path, struct ctl_table *table) +static int sysctl_check_table(const char *path, struct ctl_table_header *header) { struct ctl_table *entry; int err = 0; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || @@ -1159,8 +1161,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) return err; } -static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, - struct ctl_table_root *link_root) +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { struct ctl_table *link_table, *entry, *link; struct ctl_table_header *links; @@ -1170,7 +1171,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table name_bytes = 0; nr_entries = 0; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { nr_entries++; name_bytes += strlen(entry->procname) + 1; } @@ -1189,31 +1190,33 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table link_name = (char *)&link_table[nr_entries + 1]; link = link_table; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; - link->data = link_root; + link->data = head->root; link_name += len; link++; } - init_header(links, dir->header.root, dir->header.set, node, link_table); + init_header(links, dir->header.root, dir->header.set, node, link_table, + head->ctl_table_size); links->nreg = nr_entries; return links; } static bool get_links(struct ctl_dir *dir, - struct ctl_table *table, struct ctl_table_root *link_root) + struct ctl_table_header *header, + struct ctl_table_root *link_root) { - struct ctl_table_header *head; + struct ctl_table_header *tmp_head; struct ctl_table *entry, *link; /* Are there links available for every entry in table? */ - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); + link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) @@ -1224,10 +1227,10 @@ static bool get_links(struct ctl_dir *dir, } /* The checks passed. Increase the registration count on the links */ - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); - head->nreg++; + link = find_entry(&tmp_head, dir, procname, strlen(procname)); + tmp_head->nreg++; } return true; } @@ -1246,13 +1249,13 @@ static int insert_links(struct ctl_table_header *head) if (IS_ERR(core_parent)) return 0; - if (get_links(core_parent, head->ctl_table, head->root)) + if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); - links = new_links(core_parent, head->ctl_table, head->root); + links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; @@ -1260,7 +1263,7 @@ static int insert_links(struct ctl_table_header *head) goto out; err = 0; - if (get_links(core_parent, head->ctl_table, head->root)) { + if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } @@ -1310,6 +1313,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) * should not be free'd after registration. So it should not be * used on stack. It can either be a global or dynamically allocated * by the caller and free'd later after sysctl unregistration. + * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1352,26 +1356,21 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, - const char *path, struct ctl_table *table) + const char *path, struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; struct ctl_dir *dir; - struct ctl_table *entry; struct ctl_node *node; - int nr_entries = 0; - - list_for_each_table_entry(entry, table) - nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT); + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); - init_header(header, root, set, node, table); - if (sysctl_check_table(path, table)) + init_header(header, root, set, node, table, table_size); + if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); @@ -1401,7 +1400,7 @@ fail: } /** - * register_sysctl - register a sysctl table + * register_sysctl_sz - register a sysctl table * @path: The path to the directory the sysctl table is in. If the path * doesn't exist we will create it for you. * @table: the table structure. The calller must ensure the life of the @table @@ -1411,18 +1410,20 @@ fail: * to call unregister_sysctl_table() and can instead use something like * register_sysctl_init() which does not care for the result of the syctl * registration. + * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ -struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, + size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, - path, table); + path, table, table_size); } -EXPORT_SYMBOL(register_sysctl); +EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path @@ -1433,6 +1434,7 @@ EXPORT_SYMBOL(register_sysctl); * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails + * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default @@ -1445,12 +1447,12 @@ EXPORT_SYMBOL(register_sysctl); * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, - const char *table_name) + const char *table_name, size_t table_size) { - struct ctl_table_header *hdr = register_sysctl(path, table); + struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { - pr_err("failed when register_sysctl %s to %s\n", table_name, path); + pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); @@ -1471,7 +1473,7 @@ static void put_links(struct ctl_table_header *header) if (IS_ERR(core_parent)) return; - list_for_each_table_entry(entry, header->ctl_table) { + list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; struct ctl_table *link; const char *name = entry->procname; @@ -1535,7 +1537,7 @@ void setup_sysctl_set(struct ctl_table_set *set, { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; - init_header(&set->dir.header, root, set, NULL, root_table); + init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) diff --git a/fs/proc/root.c b/fs/proc/root.c index a86e65a608da..9191248f2dac 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -314,7 +314,8 @@ static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 72cd69bcaf4a..ecc4da8d265e 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 507cd4e59d07..3dd5be96691b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -4,6 +4,7 @@ #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> +#include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> @@ -236,21 +237,6 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -/* - * Indicate if the VMA is a stack for the given task; for - * /proc/PID/maps that is the stack of the main task. - */ -static int is_stack(struct vm_area_struct *vma) -{ - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; -} - static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, @@ -327,13 +313,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { + if (vma_is_initial_heap(vma)) { name = "[heap]"; goto done; } - if (is_stack(vma)) { + if (vma_is_initial_stack(vma)) { name = "[stack]"; goto done; } @@ -412,6 +397,7 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; + unsigned long ksm; u64 pss; u64 pss_anon; u64 pss_file; @@ -468,6 +454,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->lazyfree += size; } + if (PageKsm(page)) + mss->ksm += size; + mss->resident += size; /* Accumulate the size in pages that have been accessed. */ if (young || page_is_young(page) || PageReferenced(page)) @@ -587,8 +576,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, bool migration = false; if (pmd_present(*pmd)) { - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + page = vm_normal_page_pmd(vma, addr, *pmd); } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -709,6 +697,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_X86_USER_SHADOW_STACK + [ilog2(VM_SHADOW_STACK)] = "ss", +#endif }; size_t i; @@ -758,12 +749,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -837,6 +830,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); @@ -870,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", + seq_printf(m, "THPeligible: %8u\n", hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) @@ -1245,6 +1239,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, + .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, @@ -1622,6 +1617,7 @@ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1935,6 +1931,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1971,9 +1968,9 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); seq_file_path(m, file, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); - } else if (is_stack(vma)) { + } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2c8b62265981..7cebd397cc26 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct vm_area_struct *vma) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; -} - /* * display a single VMA to a sequenced file */ @@ -171,7 +158,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(vma)) { + } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); } @@ -188,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p) return nommu_vma_show(m, _p); } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long addr = *pos; - /* See m_next(). Zero at the start or after lseek. */ - if (addr == -1UL) + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) return NULL; /* pin the task and mm whilst we play with them */ @@ -205,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } - /* start the next element from addr */ - vma = find_vma(mm, addr); - if (vma) - return vma; + vma_iter_init(&priv->iter, mm, last_addr); - mmap_read_unlock(mm); - mmput(mm); - return NULL; + return proc_get_vma(priv, ppos); } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct vm_area_struct *vma = _p; - - *pos = vma->vm_end; - return find_vma(vma->vm_mm, vma->vm_end); + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index a553273fbd41..63ac1f93289f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index c49d554cc9ae..3acc38600cd1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config PSTORE tristate "Persistent store support" - select CRYPTO if PSTORE_COMPRESS default n help This option enables generic access to platform level @@ -22,99 +21,18 @@ config PSTORE_DEFAULT_KMSG_BYTES Defines default size of pstore kernel log storage. Can be enlarged if needed, not recommended to shrink it. -config PSTORE_DEFLATE_COMPRESS - tristate "DEFLATE (ZLIB) compression" - default y - depends on PSTORE - select CRYPTO_DEFLATE - help - This option enables DEFLATE (also known as ZLIB) compression - algorithm support. - -config PSTORE_LZO_COMPRESS - tristate "LZO compression" - depends on PSTORE - select CRYPTO_LZO - help - This option enables LZO compression algorithm support. - -config PSTORE_LZ4_COMPRESS - tristate "LZ4 compression" - depends on PSTORE - select CRYPTO_LZ4 - help - This option enables LZ4 compression algorithm support. - -config PSTORE_LZ4HC_COMPRESS - tristate "LZ4HC compression" - depends on PSTORE - select CRYPTO_LZ4HC - help - This option enables LZ4HC (high compression) mode algorithm. - -config PSTORE_842_COMPRESS - bool "842 compression" - depends on PSTORE - select CRYPTO_842 - help - This option enables 842 compression algorithm support. - -config PSTORE_ZSTD_COMPRESS - bool "zstd compression" - depends on PSTORE - select CRYPTO_ZSTD - help - This option enables zstd compression algorithm support. - config PSTORE_COMPRESS - def_bool y + bool "Pstore compression (deflate)" depends on PSTORE - depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \ - PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \ - PSTORE_842_COMPRESS || PSTORE_ZSTD_COMPRESS - -choice - prompt "Default pstore compression algorithm" - depends on PSTORE_COMPRESS + select ZLIB_INFLATE + select ZLIB_DEFLATE + default y help - This option chooses the default active compression algorithm. - This change be changed at boot with "pstore.compress=..." on - the kernel command line. - - Currently, pstore has support for 6 compression algorithms: - deflate, lzo, lz4, lz4hc, 842 and zstd. - - The default compression algorithm is deflate. - - config PSTORE_DEFLATE_COMPRESS_DEFAULT - bool "deflate" if PSTORE_DEFLATE_COMPRESS - - config PSTORE_LZO_COMPRESS_DEFAULT - bool "lzo" if PSTORE_LZO_COMPRESS - - config PSTORE_LZ4_COMPRESS_DEFAULT - bool "lz4" if PSTORE_LZ4_COMPRESS - - config PSTORE_LZ4HC_COMPRESS_DEFAULT - bool "lz4hc" if PSTORE_LZ4HC_COMPRESS - - config PSTORE_842_COMPRESS_DEFAULT - bool "842" if PSTORE_842_COMPRESS - - config PSTORE_ZSTD_COMPRESS_DEFAULT - bool "zstd" if PSTORE_ZSTD_COMPRESS - -endchoice - -config PSTORE_COMPRESS_DEFAULT - string - depends on PSTORE_COMPRESS - default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT - default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT - default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT - default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT - default "842" if PSTORE_842_COMPRESS_DEFAULT - default "zstd" if PSTORE_ZSTD_COMPRESS_DEFAULT + Whether pstore records should be compressed before being written to + the backing store. This is implemented using the zlib 'deflate' + algorithm, using the library implementation instead of using the full + blown crypto API. This reduces the risk of secondary oopses or other + problems while pstore is recording panic metadata. config PSTORE_CONSOLE bool "Log kernel console messages" diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index ffbadb8b3032..585360706b33 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -54,7 +54,7 @@ static void free_pstore_private(struct pstore_private *private) if (!private) return; if (private->record) { - kfree(private->record->buf); + kvfree(private->record->buf); kfree(private->record->priv); kfree(private->record); } @@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } @@ -390,7 +390,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode->i_private = private; if (record->time.tv_sec) - inode->i_mtime = inode->i_ctime = record->time; + inode->i_mtime = inode_set_ctime_to_ts(inode, record->time); d_add(dentry, inode); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index cbc0b468c1ab..e5bca9a004cc 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -14,24 +14,17 @@ #include <linux/init.h> #include <linux/kmsg_dump.h> #include <linux/console.h> +#include <linux/mm.h> #include <linux/module.h> #include <linux/pstore.h> -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) -#include <linux/lzo.h> -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) -#include <linux/lz4.h> -#endif -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) -#include <linux/zstd.h> -#endif -#include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> +#include <linux/vmalloc.h> #include <linux/workqueue.h> +#include <linux/zlib.h> #include "internal.h" @@ -80,12 +73,21 @@ static char *backend; module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "specific backend to use"); -static char *compress = -#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT - CONFIG_PSTORE_COMPRESS_DEFAULT; -#else - NULL; -#endif +/* + * pstore no longer implements compression via the crypto API, and only + * supports zlib deflate compression implemented using the zlib library + * interface. This removes additional complexity which is hard to justify for a + * diagnostic facility that has to operate in conditions where the system may + * have become unstable. Zlib deflate is comparatively small in terms of code + * size, and compresses ASCII text comparatively well. In terms of compression + * speed, deflate is not the best performer but for recording the log output on + * a kernel panic, this is not considered critical. + * + * The only remaining arguments supported by the compress= module parameter are + * 'deflate' and 'none'. To retain compatibility with existing installations, + * all other values are logged and replaced with 'deflate'. + */ +static char *compress = "deflate"; module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); @@ -94,16 +96,16 @@ unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; module_param(kmsg_bytes, ulong, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); -/* Compression parameters */ -static struct crypto_comp *tfm; +static void *compress_workspace; -struct pstore_zbackend { - int (*zbufsize)(size_t size); - const char *name; -}; +/* + * Compression is only used for dmesg output, which consists of low-entropy + * ASCII text, and so we can assume worst-case 60%. + */ +#define DMESG_COMP_PERCENT 60 static char *big_oops_buf; -static size_t big_oops_buf_sz; +static size_t max_compressed_size; void pstore_set_kmsg_bytes(int bytes) { @@ -168,206 +170,93 @@ static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } } -#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) -static int zbufsize_deflate(size_t size) -{ - size_t cmpr; - - switch (size) { - /* buffer range for efivars */ - case 1000 ... 2000: - cmpr = 56; - break; - case 2001 ... 3000: - cmpr = 54; - break; - case 3001 ... 3999: - cmpr = 52; - break; - /* buffer range for nvram, erst */ - case 4000 ... 10000: - cmpr = 45; - break; - default: - cmpr = 60; - break; - } - - return (size * 100) / cmpr; -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) -static int zbufsize_lzo(size_t size) -{ - return lzo1x_worst_compress(size); -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) -static int zbufsize_lz4(size_t size) -{ - return LZ4_compressBound(size); -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) -static int zbufsize_842(size_t size) -{ - return size; -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) -static int zbufsize_zstd(size_t size) -{ - return zstd_compress_bound(size); -} -#endif - -static const struct pstore_zbackend *zbackend __ro_after_init; - -static const struct pstore_zbackend zbackends[] = { -#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) - { - .zbufsize = zbufsize_deflate, - .name = "deflate", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) - { - .zbufsize = zbufsize_lzo, - .name = "lzo", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) - { - .zbufsize = zbufsize_lz4, - .name = "lz4", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) - { - .zbufsize = zbufsize_lz4, - .name = "lz4hc", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) - { - .zbufsize = zbufsize_842, - .name = "842", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) - { - .zbufsize = zbufsize_zstd, - .name = "zstd", - }, -#endif - { } -}; - static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { + struct z_stream_s zstream = { + .next_in = in, + .avail_in = inlen, + .next_out = out, + .avail_out = outlen, + .workspace = compress_workspace, + }; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); - if (ret) { - pr_err("crypto_comp_compress failed, ret = %d!\n", ret); - return ret; - } + ret = zlib_deflateInit2(&zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (ret != Z_OK) + return -EINVAL; - return outlen; + ret = zlib_deflate(&zstream, Z_FINISH); + if (ret != Z_STREAM_END) + return -EINVAL; + + ret = zlib_deflateEnd(&zstream); + if (ret != Z_OK) + pr_warn_once("zlib_deflateEnd() failed: %d\n", ret); + + return zstream.total_out; } static void allocate_buf_for_compression(void) { - struct crypto_comp *ctx; - int size; + size_t compressed_size; char *buf; - /* Skip if not built-in or compression backend not selected yet. */ - if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) - return; - - /* Skip if no pstore backend yet or compression init already done. */ - if (!psinfo || tfm) - return; - - if (!crypto_has_comp(zbackend->name, 0, 0)) { - pr_err("Unknown compression: %s\n", zbackend->name); + /* Skip if not built-in or compression disabled. */ + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !compress || + !strcmp(compress, "none")) { + compress = NULL; return; } - size = zbackend->zbufsize(psinfo->bufsize); - if (size <= 0) { - pr_err("Invalid compression size for %s: %d\n", - zbackend->name, size); - return; + if (strcmp(compress, "deflate")) { + pr_err("Unsupported compression '%s', falling back to deflate\n", + compress); + compress = "deflate"; } - buf = kmalloc(size, GFP_KERNEL); + /* + * The compression buffer only needs to be as large as the maximum + * uncompressed record size, since any record that would be expanded by + * compression is just stored uncompressed. + */ + compressed_size = (psinfo->bufsize * 100) / DMESG_COMP_PERCENT; + buf = kvzalloc(compressed_size, GFP_KERNEL); if (!buf) { - pr_err("Failed %d byte compression buffer allocation for: %s\n", - size, zbackend->name); + pr_err("Failed %zu byte compression buffer allocation for: %s\n", + psinfo->bufsize, compress); return; } - ctx = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(ctx)) { - kfree(buf); - pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(ctx)); + compress_workspace = + vmalloc(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)); + if (!compress_workspace) { + pr_err("Failed to allocate zlib deflate workspace\n"); + kvfree(buf); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = ctx; - big_oops_buf_sz = size; big_oops_buf = buf; + max_compressed_size = compressed_size; - pr_info("Using crash dump compression: %s\n", zbackend->name); + pr_info("Using crash dump compression: %s\n", compress); } static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - crypto_free_comp(tfm); - tfm = NULL; + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress_workspace) { + vfree(compress_workspace); + compress_workspace = NULL; } - kfree(big_oops_buf); - big_oops_buf = NULL; - big_oops_buf_sz = 0; -} -/* - * Called when compression fails, since the printk buffer - * would be fetched for compression calling it again when - * compression fails would have moved the iterator of - * printk buffer which results in fetching old contents. - * Copy the recent messages from big_oops_buf to psinfo->buf - */ -static size_t copy_kmsg_to_buffer(int hsize, size_t len) -{ - size_t total_len; - size_t diff; - - total_len = hsize + len; - - if (total_len > psinfo->bufsize) { - diff = total_len - psinfo->bufsize + hsize; - memcpy(psinfo->buf, big_oops_buf, hsize); - memcpy(psinfo->buf + hsize, big_oops_buf + diff, - psinfo->bufsize - hsize); - total_len = psinfo->bufsize; - } else - memcpy(psinfo->buf, big_oops_buf, total_len); - - return total_len; + kvfree(big_oops_buf); + big_oops_buf = NULL; + max_compressed_size = 0; } void pstore_record_init(struct pstore_record *record, @@ -426,13 +315,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf) { - dst = big_oops_buf; - dst_size = big_oops_buf_sz; - } else { - dst = psinfo->buf; - dst_size = psinfo->bufsize; - } + dst = big_oops_buf ?: psinfo->buf; + dst_size = max_compressed_size ?: psinfo->bufsize; /* Write dump header. */ header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, @@ -453,8 +337,15 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.compressed = true; record.size = zipped_len; } else { - record.size = copy_kmsg_to_buffer(header_size, - dump_size); + /* + * Compression failed, so the buffer is most + * likely filled with binary data that does not + * compress as well as ASCII text. Copy as much + * of the uncompressed data as possible into + * the pstore record, and discard the rest. + */ + record.size = psinfo->bufsize; + memcpy(psinfo->buf, dst, psinfo->bufsize); } } else { record.size = header_size + dump_size; @@ -549,7 +440,7 @@ static int pstore_write_user_compat(struct pstore_record *record, if (record->buf) return -EINVAL; - record->buf = memdup_user(buf, record->size); + record->buf = vmemdup_user(buf, record->size); if (IS_ERR(record->buf)) { ret = PTR_ERR(record->buf); goto out; @@ -557,7 +448,7 @@ static int pstore_write_user_compat(struct pstore_record *record, ret = record->psi->write(record); - kfree(record->buf); + kvfree(record->buf); out: record->buf = NULL; @@ -681,11 +572,13 @@ void pstore_unregister(struct pstore_info *psi) } EXPORT_SYMBOL_GPL(pstore_unregister); -static void decompress_record(struct pstore_record *record) +static void decompress_record(struct pstore_record *record, + struct z_stream_s *zstream) { int ret; int unzipped_len; char *unzipped, *workspace; + size_t max_uncompressed_size; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -697,40 +590,51 @@ static void decompress_record(struct pstore_record *record) } /* Missing compression buffer means compression was not initialized. */ - if (!big_oops_buf) { + if (!zstream->workspace) { pr_warn("no decompression method initialized!\n"); return; } + ret = zlib_inflateReset(zstream); + if (ret != Z_OK) { + pr_err("zlib_inflateReset() failed, ret = %d!\n", ret); + return; + } + /* Allocate enough space to hold max decompression and ECC. */ - unzipped_len = big_oops_buf_sz; - workspace = kmalloc(unzipped_len + record->ecc_notice_size, - GFP_KERNEL); + max_uncompressed_size = 3 * psinfo->bufsize; + workspace = kvzalloc(max_uncompressed_size + record->ecc_notice_size, + GFP_KERNEL); if (!workspace) return; - /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_comp_decompress(tfm, record->buf, record->size, - workspace, &unzipped_len); - if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); - kfree(workspace); + zstream->next_in = record->buf; + zstream->avail_in = record->size; + zstream->next_out = workspace; + zstream->avail_out = max_uncompressed_size; + + ret = zlib_inflate(zstream, Z_FINISH); + if (ret != Z_STREAM_END) { + pr_err_ratelimited("zlib_inflate() failed, ret = %d!\n", ret); + kvfree(workspace); return; } + unzipped_len = zstream->total_out; + /* Append ECC notice to decompressed buffer. */ memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); /* Copy decompressed contents into an minimum-sized allocation. */ - unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, - GFP_KERNEL); - kfree(workspace); + unzipped = kvmemdup(workspace, unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + kvfree(workspace); if (!unzipped) return; /* Swap out compressed contents with decompressed contents. */ - kfree(record->buf); + kvfree(record->buf); record->buf = unzipped; record->size = unzipped_len; record->compressed = false; @@ -747,10 +651,17 @@ void pstore_get_backend_records(struct pstore_info *psi, { int failed = 0; unsigned int stop_loop = 65536; + struct z_stream_s zstream = {}; if (!psi || !root) return; + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) { + zstream.workspace = kvmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + zlib_inflateInit2(&zstream, -DEF_WBITS); + } + mutex_lock(&psi->read_mutex); if (psi->open && psi->open(psi)) goto out; @@ -779,11 +690,11 @@ void pstore_get_backend_records(struct pstore_info *psi, break; } - decompress_record(record); + decompress_record(record, &zstream); rc = pstore_mkfile(root, record); if (rc) { /* pstore_mkfile() did not take record, so free it. */ - kfree(record->buf); + kvfree(record->buf); kfree(record->priv); kfree(record); if (rc != -EEXIST || !quiet) @@ -795,6 +706,12 @@ void pstore_get_backend_records(struct pstore_info *psi, out: mutex_unlock(&psi->read_mutex); + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) { + if (zlib_inflateEnd(&zstream) != Z_OK) + pr_warn("zlib_inflateEnd() failed\n"); + kvfree(zstream.workspace); + } + if (failed) pr_warn("failed to create %d record(s) from '%s'\n", failed, psi->name); @@ -818,34 +735,10 @@ static void pstore_timefunc(struct timer_list *unused) pstore_timer_kick(); } -static void __init pstore_choose_compression(void) -{ - const struct pstore_zbackend *step; - - if (!compress) - return; - - for (step = zbackends; step->name; step++) { - if (!strcmp(compress, step->name)) { - zbackend = step; - return; - } - } -} - static int __init pstore_init(void) { int ret; - pstore_choose_compression(); - - /* - * Check if any pstore backends registered earlier but did not - * initialize compression because crypto was not ready. If so, - * initialize compression now. - */ - allocate_buf_for_compression(); - ret = pstore_init_fs(); if (ret) free_buf_for_compression(); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2f625e1fa8d8..d36702c7ab3c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -20,6 +20,7 @@ #include <linux/compiler.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/mm.h> #include "internal.h" #include "ram_internal.h" @@ -268,7 +269,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) /* ECC correction notice */ record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + record->buf = kvzalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); if (record->buf == NULL) { size = -ENOMEM; goto out; @@ -282,7 +283,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) out: if (free_prz) { - kfree(prz->old_log); + kvfree(prz->old_log); kfree(prz); } @@ -833,7 +834,7 @@ static int ramoops_probe(struct platform_device *pdev) */ if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) { cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; - cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + cxt->pstore.buf = kvzalloc(cxt->pstore.bufsize, GFP_KERNEL); if (!cxt->pstore.buf) { pr_err("cannot allocate pstore crash dump buffer\n"); err = -ENOMEM; @@ -866,7 +867,7 @@ static int ramoops_probe(struct platform_device *pdev) return 0; fail_buf: - kfree(cxt->pstore.buf); + kvfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; fail_init: @@ -881,7 +882,7 @@ static void ramoops_remove(struct platform_device *pdev) pstore_unregister(&cxt->pstore); - kfree(cxt->pstore.buf); + kvfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; ramoops_free_przs(cxt); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 85aaf0fc6d7d..650e437b55e6 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/page.h> #include "ram_internal.h" @@ -24,12 +25,10 @@ /** * struct persistent_ram_buffer - persistent circular RAM buffer * - * @sig: - * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) - * @start: - * offset into @data where the beginning of the stored bytes begin - * @size: - * number of valid bytes stored in @data + * @sig: Signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) + * @start: First valid byte in the buffer. + * @size: Number of valid bytes in the buffer. + * @data: The contents of the buffer. */ struct persistent_ram_buffer { uint32_t sig; @@ -301,7 +300,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) if (!prz->old_log) { persistent_ram_ecc_old(prz); - prz->old_log = kmalloc(size, GFP_KERNEL); + prz->old_log = kvzalloc(size, GFP_KERNEL); } if (!prz->old_log) { pr_err("failed to allocate buffer\n"); @@ -385,7 +384,7 @@ void *persistent_ram_old(struct persistent_ram_zone *prz) void persistent_ram_free_old(struct persistent_ram_zone *prz) { - kfree(prz->old_log); + kvfree(prz->old_log); prz->old_log = NULL; prz->old_log_size = 0; } @@ -519,7 +518,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { - if (buffer_size(prz) == 0) { + if (buffer_size(prz) == 0 && buffer_start(prz) == 0) { pr_debug("found existing empty buffer\n"); return 0; } diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index 45b5b98376c4..a2eb826e76c6 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -2,6 +2,7 @@ config QNX4FS_FS tristate "QNX4 file system support (read only)" depends on BLOCK + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 4 and QNX 6 (the latter is also called QNX RTP). diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 391ea402920d..a7171f5532a1 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -305,8 +305,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0); inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig index 6a9d6bce1586..8e865d72204e 100644 --- a/fs/qnx6/Kconfig +++ b/fs/qnx6/Kconfig @@ -2,6 +2,7 @@ config QNX6FS_FS tristate "QNX6 file system support (read only)" depends on BLOCK && CRC32 + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 6 (also called QNX RTP). diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85b2fa3b211c..21f90d519f1a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -562,8 +562,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime); inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ inode->i_blocks = (inode->i_size + 511) >> 9; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e3e4f4047657..9e72bfe8bbad 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -225,13 +225,22 @@ static void put_quota_format(struct quota_format_type *fmt) /* * Dquot List Management: - * The quota code uses four lists for dquot management: the inuse_list, - * free_dquots, dqi_dirty_list, and dquot_hash[] array. A single dquot - * structure may be on some of those lists, depending on its current state. + * The quota code uses five lists for dquot management: the inuse_list, + * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array. + * A single dquot structure may be on some of those lists, depending on + * its current state. * * All dquots are placed to the end of inuse_list when first created, and this * list is used for invalidate operation, which must look at every dquot. * + * When the last reference of a dquot will be dropped, the dquot will be + * added to releasing_dquots. We'd then queue work item which would call + * synchronize_srcu() and after that perform the final cleanup of all the + * dquots on the list. Both releasing_dquots and free_dquots use the + * dq_free list_head in the dquot struct. When a dquot is removed from + * releasing_dquots, a reference count is always subtracted, and if + * dq_count == 0 at that point, the dquot will be added to the free_dquots. + * * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, * and this list is searched whenever we need an available dquot. Dquots are * removed from the list as soon as they are used again, and @@ -250,6 +259,7 @@ static void put_quota_format(struct quota_format_type *fmt) static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); +static LIST_HEAD(releasing_dquots); static unsigned int dq_hash_bits, dq_hash_mask; static struct hlist_head *dquot_hash; @@ -260,6 +270,9 @@ static qsize_t inode_get_rsv_space(struct inode *inode); static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); +static void quota_release_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn); + static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) { @@ -305,12 +318,18 @@ static inline void put_dquot_last(struct dquot *dquot) dqstats_inc(DQST_FREE_DQUOTS); } +static inline void put_releasing_dquots(struct dquot *dquot) +{ + list_add_tail(&dquot->dq_free, &releasing_dquots); +} + static inline void remove_free_dquot(struct dquot *dquot) { if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); - dqstats_dec(DQST_FREE_DQUOTS); + if (!atomic_read(&dquot->dq_count)) + dqstats_dec(DQST_FREE_DQUOTS); } static inline void put_inuse(struct dquot *dquot) @@ -336,6 +355,11 @@ static void wait_on_dquot(struct dquot *dquot) mutex_unlock(&dquot->dq_lock); } +static inline int dquot_active(struct dquot *dquot) +{ + return test_bit(DQ_ACTIVE_B, &dquot->dq_flags); +} + static inline int dquot_dirty(struct dquot *dquot) { return test_bit(DQ_MOD_B, &dquot->dq_flags); @@ -351,14 +375,14 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + if (!dquot_active(dquot)) return 0; if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); /* If quota is dirty already, we don't have to acquire dq_list_lock */ - if (test_bit(DQ_MOD_B, &dquot->dq_flags)) + if (dquot_dirty(dquot)) return 1; spin_lock(&dq_list_lock); @@ -440,7 +464,7 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { + if (!dquot_active(dquot) && !dquot->dq_off) { ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { @@ -482,7 +506,7 @@ int dquot_commit(struct dquot *dquot) goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + if (dquot_active(dquot)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; @@ -547,6 +571,8 @@ static void invalidate_dquots(struct super_block *sb, int type) struct dquot *dquot, *tmp; restart: + flush_delayed_work("a_release_work); + spin_lock(&dq_list_lock); list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) @@ -555,6 +581,12 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { + /* dquot in releasing_dquots, flush and retry */ + if (!list_empty(&dquot->dq_free)) { + spin_unlock(&dq_list_lock); + goto restart; + } + atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* @@ -597,7 +629,7 @@ int dquot_scan_active(struct super_block *sb, spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + if (!dquot_active(dquot)) continue; if (dquot->dq_sb != sb) continue; @@ -612,7 +644,7 @@ int dquot_scan_active(struct super_block *sb, * outstanding call and recheck the DQ_ACTIVE_B after that. */ wait_on_dquot(dquot); - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + if (dquot_active(dquot)) { ret = fn(dquot, priv); if (ret < 0) goto out; @@ -628,6 +660,18 @@ out: } EXPORT_SYMBOL(dquot_scan_active); +static inline int dquot_write_dquot(struct dquot *dquot) +{ + int ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota structure " + "(error %d). Quota may get out of sync!", ret); + /* Clear dirty bit anyway to avoid infinite loop. */ + clear_dquot_dirty(dquot); + } + return ret; +} + /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { @@ -651,23 +695,16 @@ int dquot_writeback_dquots(struct super_block *sb, int type) dquot = list_first_entry(&dirty, struct dquot, dq_dirty); - WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); + WARN_ON(!dquot_active(dquot)); /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); - err = sb->dq_op->write_dquot(dquot); - if (err) { - /* - * Clear dirty bit anyway to avoid infinite - * loop here. - */ - clear_dquot_dirty(dquot); - if (!ret) - ret = err; - } + err = dquot_write_dquot(dquot); + if (err && !ret) + ret = err; dqput(dquot); spin_lock(&dq_list_lock); } @@ -761,12 +798,53 @@ static struct shrinker dqcache_shrinker = { }; /* + * Safely release dquot and put reference to dquot. + */ +static void quota_release_workfn(struct work_struct *work) +{ + struct dquot *dquot; + struct list_head rls_head; + + spin_lock(&dq_list_lock); + /* Exchange the list head to avoid livelock. */ + list_replace_init(&releasing_dquots, &rls_head); + spin_unlock(&dq_list_lock); + +restart: + synchronize_srcu(&dquot_srcu); + spin_lock(&dq_list_lock); + while (!list_empty(&rls_head)) { + dquot = list_first_entry(&rls_head, struct dquot, dq_free); + /* Dquot got used again? */ + if (atomic_read(&dquot->dq_count) > 1) { + remove_free_dquot(dquot); + atomic_dec(&dquot->dq_count); + continue; + } + if (dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + /* Commit dquot before releasing */ + dquot_write_dquot(dquot); + goto restart; + } + if (dquot_active(dquot)) { + spin_unlock(&dq_list_lock); + dquot->dq_sb->dq_op->release_dquot(dquot); + goto restart; + } + /* Dquot is inactive and clean, now move it to free list */ + remove_free_dquot(dquot); + atomic_dec(&dquot->dq_count); + put_dquot_last(dquot); + } + spin_unlock(&dq_list_lock); +} + +/* * Put reference to dquot */ void dqput(struct dquot *dquot) { - int ret; - if (!dquot) return; #ifdef CONFIG_QUOTA_DEBUG @@ -778,7 +856,7 @@ void dqput(struct dquot *dquot) } #endif dqstats_inc(DQST_DROPS); -we_slept: + spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { /* We have more than one user... nothing to do */ @@ -790,35 +868,15 @@ we_slept: spin_unlock(&dq_list_lock); return; } + /* Need to release dquot? */ - if (dquot_dirty(dquot)) { - spin_unlock(&dq_list_lock); - /* Commit dquot before releasing */ - ret = dquot->dq_sb->dq_op->write_dquot(dquot); - if (ret < 0) { - quota_error(dquot->dq_sb, "Can't write quota structure" - " (error %d). Quota may get out of sync!", - ret); - /* - * We clear dirty bit anyway, so that we avoid - * infinite loop here - */ - clear_dquot_dirty(dquot); - } - goto we_slept; - } - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { - spin_unlock(&dq_list_lock); - dquot->dq_sb->dq_op->release_dquot(dquot); - goto we_slept; - } - atomic_dec(&dquot->dq_count); #ifdef CONFIG_QUOTA_DEBUG /* sanity check */ BUG_ON(!list_empty(&dquot->dq_free)); #endif - put_dquot_last(dquot); + put_releasing_dquots(dquot); spin_unlock(&dq_list_lock); + queue_delayed_work(system_unbound_wq, "a_release_work, 1); } EXPORT_SYMBOL(dqput); @@ -908,7 +966,7 @@ we_slept: * already finished or it will be canceled due to dq_count > 1 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + if (!dquot_active(dquot)) { int err; err = sb->dq_op->acquire_dquot(dquot); @@ -1014,59 +1072,7 @@ out: return err; } -/* - * Remove references to dquots from inode and add dquot to list for freeing - * if we have the last reference to dquot - */ -static void remove_inode_dquot_ref(struct inode *inode, int type, - struct list_head *tofree_head) -{ - struct dquot **dquots = i_dquot(inode); - struct dquot *dquot = dquots[type]; - - if (!dquot) - return; - - dquots[type] = NULL; - if (list_empty(&dquot->dq_free)) { - /* - * The inode still has reference to dquot so it can't be in the - * free list - */ - spin_lock(&dq_list_lock); - list_add(&dquot->dq_free, tofree_head); - spin_unlock(&dq_list_lock); - } else { - /* - * Dquot is already in a list to put so we won't drop the last - * reference here. - */ - dqput(dquot); - } -} - -/* - * Free list of dquots - * Dquots are removed from inodes and no new references can be got so we are - * the only ones holding reference - */ -static void put_dquot_list(struct list_head *tofree_head) -{ - struct list_head *act_head; - struct dquot *dquot; - - act_head = tofree_head->next; - while (act_head != tofree_head) { - dquot = list_entry(act_head, struct dquot, dq_free); - act_head = act_head->next; - /* Remove dquot from the list so we won't have problems... */ - list_del_init(&dquot->dq_free); - dqput(dquot); - } -} - -static void remove_dquot_ref(struct super_block *sb, int type, - struct list_head *tofree_head) +static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; #ifdef CONFIG_QUOTA_DEBUG @@ -1083,11 +1089,16 @@ static void remove_dquot_ref(struct super_block *sb, int type, */ spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { + struct dquot **dquots = i_dquot(inode); + struct dquot *dquot = dquots[type]; + #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - remove_inode_dquot_ref(inode, type, tofree_head); + dquots[type] = NULL; + if (dquot) + dqput(dquot); } spin_unlock(&dq_data_lock); } @@ -1104,13 +1115,8 @@ static void remove_dquot_ref(struct super_block *sb, int type, /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { - LIST_HEAD(tofree_head); - - if (sb->dq_op) { - remove_dquot_ref(sb, type, &tofree_head); - synchronize_srcu(&dquot_srcu); - put_dquot_list(&tofree_head); - } + if (sb->dq_op) + remove_dquot_ref(sb, type); } static inline @@ -1425,7 +1431,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_NOWARN; } -static int dquot_active(const struct inode *inode) +static int inode_quota_active(const struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1448,7 +1454,7 @@ static int __dquot_initialize(struct inode *inode, int type) qsize_t rsv; int ret = 0; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return 0; dquots = i_dquot(inode); @@ -1556,7 +1562,7 @@ bool dquot_initialize_needed(struct inode *inode) struct dquot **dquots; int i; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return false; dquots = i_dquot(inode); @@ -1667,7 +1673,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) int reserve = flags & DQUOT_SPACE_RESERVE; struct dquot **dquots; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; @@ -1737,7 +1743,7 @@ int dquot_alloc_inode(struct inode *inode) struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; @@ -1780,7 +1786,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) struct dquot **dquots; int cnt, index; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); @@ -1822,7 +1828,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) struct dquot **dquots; int cnt, index; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); @@ -1866,7 +1872,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) struct dquot **dquots; int reserve = flags & DQUOT_SPACE_RESERVE, index; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; @@ -1921,7 +1927,7 @@ void dquot_free_inode(struct inode *inode) struct dquot * const *dquots; int index; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return; dquots = i_dquot(inode); @@ -2093,7 +2099,7 @@ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct super_block *sb = inode->i_sb; int ret; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return 0; if (i_uid_needs_update(idmap, iattr, inode)) { @@ -2359,15 +2365,14 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, struct quota_info *dqopt = sb_dqopt(sb); int error; + lockdep_assert_held_write(&sb->s_umount); + /* Just unsuspend quotas? */ BUG_ON(flags & DQUOT_SUSPENDED); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); if (!fmt) return -ESRCH; - if (!sb->s_op->quota_write || !sb->s_op->quota_read || + if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index fef477c78107..18e8387cab41 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); } return error; } @@ -138,7 +138,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (!error) { d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); } else iput(inode); } diff --git a/fs/read_write.c b/fs/read_write.c index b07de77ef126..4771701c896b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos); * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek - * @size: max size of this file in file system + * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 4d22ecfe0fab..0e6fe26458fe 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS tristate "Reiserfs support (deprecated)" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index fefe87e1c099..6c13a8d9a73c 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2252,8 +2252,9 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) return sizeof(struct virtual_node) + max(max_num_of_items * sizeof(struct virtual_item), - sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + - (max_num_of_entries - 1) * sizeof(__u16)); + sizeof(struct virtual_item) + + struct_size_t(struct direntry_uarea, entry_sizes, + max_num_of_entries)); } /* diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77bd3b27059f..86e55d4bb10d 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1259,9 +1259,8 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_size = sd_v1_size(sd); inode->i_atime.tv_sec = sd_v1_atime(sd); inode->i_mtime.tv_sec = sd_v1_mtime(sd); - inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode_set_ctime(inode, sd_v1_ctime(sd), 0); inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_blocks = sd_v1_blocks(sd); @@ -1314,8 +1313,7 @@ static void init_inode(struct inode *inode, struct treepath *path) i_gid_write(inode, sd_v2_gid(sd)); inode->i_mtime.tv_sec = sd_v2_mtime(sd); inode->i_atime.tv_sec = sd_v2_atime(sd); - inode->i_ctime.tv_sec = sd_v2_ctime(sd); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, sd_v2_ctime(sd), 0); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_blocks = sd_v2_blocks(sd); @@ -1374,7 +1372,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_gid(sd_v2, i_gid_read(inode)); set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); - set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec); set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); @@ -1394,7 +1392,7 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) set_sd_v1_nlink(sd_v1, inode->i_nlink); set_sd_v1_size(sd_v1, size); set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); - set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec); set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) @@ -1986,7 +1984,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_size = i_size; inode->i_blocks = 0; inode->i_bytes = 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6bf9b54e58ca..dd33f8cc6eda 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -55,7 +55,7 @@ int reiserfs_fileattr_set(struct mnt_idmap *idmap, } sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); err = 0; unlock: @@ -107,7 +107,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = -EFAULT; goto setversion_out; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); setversion_out: mnt_drop_write_file(filp); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 479aa4a57602..015bfe4e4524 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2326,7 +2326,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, int i, j; bh = __getblk(dev, block, bufsize); - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return (bh); if (block + BUFNR > max_block) { @@ -2336,6 +2336,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, j = 1; for (i = 1; i < blocks; i++) { bh = __getblk(dev, block + i, bufsize); + if (!bh) + break; if (buffer_uptodate(bh)) { brelse(bh); break; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 52240cc891cf..9c5704be2435 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } dir->i_size += paste_size; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (!S_ISDIR(inode->i_mode) && visible) /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); @@ -966,7 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_nlink); clear_nlink(inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) @@ -1070,11 +1071,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) inc_nlink(inode); goto end_unlink; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); reiserfs_update_sd(&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); reiserfs_update_sd(&th, dir); if (!savelink) @@ -1250,7 +1251,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, return err ? err : retval; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); reiserfs_update_sd(&th, inode); ihold(inode); @@ -1325,7 +1326,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap, int jbegin_count; umode_t old_inode_mode; unsigned long savelink = 1; - struct timespec64 ctime; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1576,14 +1576,11 @@ static int reiserfs_rename(struct mnt_idmap *idmap, mark_de_hidden(old_de.de_deh + old_de.de_entry_num); journal_mark_dirty(&th, old_de.de_bh); - ctime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; /* * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch * which adds ctime update of renamed object */ - old_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (new_dentry_inode) { /* adjust link number of the victim */ @@ -1592,7 +1589,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } else { drop_nlink(new_dentry_inode); } - new_dentry_inode->i_ctime = ctime; savelink = new_dentry_inode->i_nlink; } diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 55e85256aae8..7d12b8c5b2fa 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2373,7 +2373,7 @@ struct virtual_node { struct direntry_uarea { int flags; __u16 entry_count; - __u16 entry_sizes[1]; + __u16 entry_sizes[]; } __attribute__ ((__packed__)); /*************************************************************************** @@ -2699,7 +2699,7 @@ struct reiserfs_iget_args { #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) #define journal_trans_half(blocksize) \ - ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32)) + ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32)) /* journal.c see journal.c for all the comments here */ @@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc { __le32 j_len; __le32 j_mount_id; /* mount id of this trans */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) @@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc { struct reiserfs_journal_commit { __le32 j_trans_id; /* must match j_trans_id from the desc block */ __le32 j_len; /* ditto */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index ce5003986789..3676e02a0232 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2004,7 +2004,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, if (update_timestamps) { inode->i_mtime = current_time(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); @@ -2029,7 +2029,7 @@ update_and_out: if (update_timestamps) { /* this is truncate, not file closing */ inode->i_mtime = current_time(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 929acce6e731..7eaf36b3de12 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2587,7 +2587,7 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 651027967159..6000964c2b80 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -466,12 +466,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec64 now = current_time(inode); + struct timespec64 ctime = inode_get_ctime(inode); if (inode_unhashed(inode) || !inode->i_nlink || - timespec64_equal(&inode->i_ctime, &now)) + timespec64_equal(&ctime, &now)) return; - inode->i_ctime = current_time(inode); + inode_set_ctime_to_ts(inode, now); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 138060452678..064264992b49 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -285,7 +285,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, if (error == -ENODATA) { error = 0; if (type == ACL_TYPE_ACCESS) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } } diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 8eb87008b55a..f24a96a331af 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -57,6 +57,7 @@ endchoice config ROMFS_ON_BLOCK bool default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + select BUFFER_HEAD config ROMFS_ON_MTD bool diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c59b230d55b4..5c35f6c76037 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -322,8 +322,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0); /* set up mode and ops */ mode = romfs_modemap[nextfh & ROMFH_TYPE]; @@ -583,16 +582,18 @@ static int romfs_init_fs_context(struct fs_context *fc) */ static void romfs_kill_sb(struct super_block *sb) { + generic_shutdown_super(sb); + #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) { - kill_mtd_super(sb); - return; + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; } #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { - kill_block_super(sb); - return; + sync_blockdev(sb->s_bdev); + blkdev_put(sb->s_bdev, sb); } #endif } diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 4c0d53bf931a..2927bd174a88 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -3,6 +3,7 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET select NLS + select NLS_UCS2_UTILS select CRYPTO select CRYPTO_MD5 select CRYPTO_SHA256 diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile index 304a7f6cc13a..0b07eb94c93b 100644 --- a/fs/smb/client/Makefile +++ b/fs/smb/client/Makefile @@ -11,7 +11,8 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \ readdir.o ioctl.o sess.o export.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ - dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o + dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \ + namespace.o $(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h @@ -21,7 +22,7 @@ cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o dfs.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += dfs_cache.o dfs.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe483f163dbc..e2be8aedb26e 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -18,7 +18,8 @@ static void smb2_close_cached_fid(struct kref *ref); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, - bool lookup_only) + bool lookup_only, + __u32 max_cached_dirs) { struct cached_fid *cfid; @@ -43,7 +44,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, spin_unlock(&cfids->cfid_list_lock); return NULL; } - if (cfids->num_entries >= MAX_CACHED_FIDS) { + if (cfids->num_entries >= max_cached_dirs) { spin_unlock(&cfids->cfid_list_lock); return NULL; } @@ -145,7 +146,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *npath; if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || - is_smb1_server(tcon->ses->server)) + is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) return -EOPNOTSUPP; ses = tcon->ses; @@ -162,7 +163,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - cfid = find_or_create_cached_dir(cfids, path, lookup_only); + cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs); if (cfid == NULL) { kfree(utf16_path); return -ENOENT; @@ -218,7 +219,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, .tcon = tcon, .path = path, .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), - .desired_access = FILE_READ_ATTRIBUTES, + .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, .disposition = FILE_OPEN, .fid = pfid, }; @@ -451,6 +452,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) struct cached_fid *cfid, *q; LIST_HEAD(entry); + if (cfids == NULL) + return; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { list_move(&cfid->entry, &entry); @@ -582,7 +586,7 @@ cifs_cfids_laundromat_thread(void *p) return 0; spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - if (time_after(jiffies, cfid->time + HZ * 30)) { + if (time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { list_del(&cfid->entry); list_add(&cfid->entry, &entry); cfids->num_entries--; @@ -650,6 +654,9 @@ void free_cached_dirs(struct cached_fids *cfids) struct cached_fid *cfid, *q; LIST_HEAD(entry); + if (cfids == NULL) + return; + if (cfids->laundromat) { kthread_stop(cfids->laundromat); cfids->laundromat = NULL; diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index facc9b154d00..a82ff2cea789 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -49,7 +49,7 @@ struct cached_fid { struct cached_dirents dirents; }; -#define MAX_CACHED_FIDS 16 +/* default MAX_CACHED_FIDS is 16 */ struct cached_fids { /* Must be held when: * - accessing the cfids->entries list diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index aec6e9137474..76922fcc4bc6 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -331,7 +331,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { /* channel info will be printed as a part of sessions below */ - if (CIFS_SERVER_IS_CHAN(server)) + if (SERVER_IS_CHAN(server)) continue; c++; diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c index e7582dd79179..79d99a913944 100644 --- a/fs/smb/client/cifs_unicode.c +++ b/fs/smb/client/cifs_unicode.c @@ -8,7 +8,6 @@ #include <linux/slab.h> #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#include "cifs_uniupr.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifs_debug.h" diff --git a/fs/smb/client/cifs_unicode.h b/fs/smb/client/cifs_unicode.h index 80b3d845419f..e137a0dfbbe9 100644 --- a/fs/smb/client/cifs_unicode.h +++ b/fs/smb/client/cifs_unicode.h @@ -21,21 +21,7 @@ #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> - -#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ - -/* - * Windows maps these to the user defined 16 bit Unicode range since they are - * reserved symbols (along with \ and /), otherwise illegal to store - * in filenames in NTFS - */ -#define UNI_ASTERISK (__u16) ('*' + 0xF000) -#define UNI_QUESTION (__u16) ('?' + 0xF000) -#define UNI_COLON (__u16) (':' + 0xF000) -#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) -#define UNI_LESSTHAN (__u16) ('<' + 0xF000) -#define UNI_PIPE (__u16) ('|' + 0xF000) -#define UNI_SLASH (__u16) ('\\' + 0xF000) +#include "../../nls/nls_ucs2_utils.h" /* * Macs use an older "SFM" mapping of the symbols above. Fortunately it does @@ -68,27 +54,6 @@ #define SFM_MAP_UNI_RSVD 1 #define SFU_MAP_UNI_RSVD 2 -/* Just define what we want from uniupr.h. We don't want to define the tables - * in each source file. - */ -#ifndef UNICASERANGE_DEFINED -struct UniCaseRange { - wchar_t start; - wchar_t end; - signed char *table; -}; -#endif /* UNICASERANGE_DEFINED */ - -#ifndef UNIUPR_NOUPPER -extern signed char CifsUniUpperTable[512]; -extern const struct UniCaseRange CifsUniUpperRange[]; -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -extern signed char CifsUniLowerTable[512]; -extern const struct UniCaseRange CifsUniLowerRange[]; -#endif /* UNIUPR_NOLOWER */ - #ifdef __KERNEL__ int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, const struct nls_table *cp, int map_type); @@ -108,297 +73,4 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, wchar_t cifs_toupper(wchar_t in); -/* - * UniStrcat: Concatenate the second string to the first - * - * Returns: - * Address of the first string - */ -static inline __le16 * -UniStrcat(__le16 *ucs1, const __le16 *ucs2) -{ - __le16 *anchor = ucs1; /* save a pointer to start of ucs1 */ - - while (*ucs1++) ; /* To end of first string */ - ucs1--; /* Return to the null */ - while ((*ucs1++ = *ucs2++)) ; /* copy string 2 over */ - return anchor; -} - -/* - * UniStrchr: Find a character in a string - * - * Returns: - * Address of first occurrence of character in string - * or NULL if the character is not in the string - */ -static inline wchar_t * -UniStrchr(const wchar_t *ucs, wchar_t uc) -{ - while ((*ucs != uc) && *ucs) - ucs++; - - if (*ucs == uc) - return (wchar_t *) ucs; - return NULL; -} - -/* - * UniStrcmp: Compare two strings - * - * Returns: - * < 0: First string is less than second - * = 0: Strings are equal - * > 0: First string is greater than second - */ -static inline int -UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) -{ - while ((*ucs1 == *ucs2) && *ucs1) { - ucs1++; - ucs2++; - } - return (int) *ucs1 - (int) *ucs2; -} - -/* - * UniStrcpy: Copy a string - */ -static inline wchar_t * -UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) -{ - wchar_t *anchor = ucs1; /* save the start of result string */ - - while ((*ucs1++ = *ucs2++)) ; - return anchor; -} - -/* - * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) - */ -static inline size_t -UniStrlen(const wchar_t *ucs1) -{ - int i = 0; - - while (*ucs1++) - i++; - return i; -} - -/* - * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a - * string (length limited) - */ -static inline size_t -UniStrnlen(const wchar_t *ucs1, int maxlen) -{ - int i = 0; - - while (*ucs1++) { - i++; - if (i >= maxlen) - break; - } - return i; -} - -/* - * UniStrncat: Concatenate length limited string - */ -static inline wchar_t * -UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; /* save pointer to string 1 */ - - while (*ucs1++) ; - ucs1--; /* point to null terminator of s1 */ - while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ - ucs1++; - ucs2++; - } - *ucs1 = 0; /* Null terminate the result */ - return (anchor); -} - -/* - * UniStrncmp: Compare length limited string - */ -static inline int -UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == *ucs2) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int) *ucs1 - (int) *ucs2; -} - -/* - * UniStrncmp_le: Compare length limited string - native to little-endian - */ -static inline int -UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); -} - -/* - * UniStrncpy: Copy length limited string with pad - */ -static inline wchar_t * -UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = *ucs2++; - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrncpy_le: Copy length limited string with pad to little-endian - */ -static inline wchar_t * -UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = __le16_to_cpu(*ucs2++); - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrstr: Find a string in a string - * - * Returns: - * Address of first match found - * NULL if no matching string is found - */ -static inline wchar_t * -UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) -{ - const wchar_t *anchor1 = ucs1; - const wchar_t *anchor2 = ucs2; - - while (*ucs1) { - if (*ucs1 == *ucs2) { - /* Partial match found */ - ucs1++; - ucs2++; - } else { - if (!*ucs2) /* Match found */ - return (wchar_t *) anchor1; - ucs1 = ++anchor1; /* No match */ - ucs2 = anchor2; - } - } - - if (!*ucs2) /* Both end together */ - return (wchar_t *) anchor1; /* Match found */ - return NULL; /* No match */ -} - -#ifndef UNIUPR_NOUPPER -/* - * UniToupper: Convert a unicode character to upper case - */ -static inline wchar_t -UniToupper(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(CifsUniUpperTable)) { - /* Latin characters */ - return uc + CifsUniUpperTable[uc]; /* Use base tables */ - } else { - rp = CifsUniUpperRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - } - return uc; /* Past last range */ -} - -/* - * UniStrupr: Upper case a unicode string - */ -static inline __le16 * -UniStrupr(register __le16 *upin) -{ - register __le16 *up; - - up = upin; - while (*up) { /* For all characters */ - *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); - up++; - } - return upin; /* Return input pointer */ -} -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -/* - * UniTolower: Convert a unicode character to lower case - */ -static inline wchar_t -UniTolower(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(CifsUniLowerTable)) { - /* Latin characters */ - return uc + CifsUniLowerTable[uc]; /* Use base tables */ - } else { - rp = CifsUniLowerRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - } - return uc; /* Past last range */ -} - -/* - * UniStrlwr: Lower case a unicode string - */ -static inline wchar_t * -UniStrlwr(register wchar_t *upin) -{ - register wchar_t *up; - - up = upin; - while (*up) { /* For all characters */ - *up = UniTolower(*up); - up++; - } - return upin; /* Return input pointer */ -} - -#endif - #endif /* _CIFS_UNICODE_H */ diff --git a/fs/smb/client/cifs_uniupr.h b/fs/smb/client/cifs_uniupr.h deleted file mode 100644 index 7b272fcdf0d3..000000000000 --- a/fs/smb/client/cifs_uniupr.h +++ /dev/null @@ -1,239 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) International Business Machines Corp., 2000,2002 - * - * uniupr.h - Unicode compressed case ranges -*/ - -#ifndef UNIUPR_NOUPPER -/* - * Latin upper case - */ -signed char CifsUniUpperTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ - 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 060-06f */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 0e0-0ef */ - -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, /* 0f0-0ff */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ - 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ - 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ - 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ - -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ - 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ -}; - -/* Upper case range - Greek */ -static signed char UniCaseRangeU03a0[47] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */ - 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 3b0-3bf */ - -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, - -63, -63, -}; - -/* Upper case range - Cyrillic */ -static signed char UniCaseRangeU0430[48] = { - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 430-43f */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 440-44f */ - 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80, /* 450-45f */ -}; - -/* Upper case range - Extended cyrillic */ -static signed char UniCaseRangeU0490[61] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ - 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, -}; - -/* Upper case range - Extended latin and greek */ -static signed char UniCaseRangeU1e00[509] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ - 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ - 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ - 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0, /* 1f70-1f7f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ - 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ - 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ - 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Upper case range - Wide latin */ -static signed char UniCaseRangeUff40[27] = { - 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* ff40-ff4f */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -}; - -/* - * Upper Case Range - */ -const struct UniCaseRange CifsUniUpperRange[] = { - {0x03a0, 0x03ce, UniCaseRangeU03a0}, - {0x0430, 0x045f, UniCaseRangeU0430}, - {0x0490, 0x04cc, UniCaseRangeU0490}, - {0x1e00, 0x1ffc, UniCaseRangeU1e00}, - {0xff40, 0xff5a, UniCaseRangeUff40}, - {0} -}; -#endif - -#ifndef UNIUPR_NOLOWER -/* - * Latin lower case - */ -signed char CifsUniLowerTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 040-04f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, /* 050-05f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0c0-0cf */ - 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0, /* 0d0-0df */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ - 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ - 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0, /* 170-17f */ - 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0, /* 180-18f */ - 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ - 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ - 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ -}; - -/* Lower case range - Greek */ -static signed char UniCaseRangeL0380[44] = { - 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 390-39f */ - 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* Lower case range - Cyrillic */ -static signed char UniCaseRangeL0400[48] = { - 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80, /* 400-40f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 410-41f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 420-42f */ -}; - -/* Lower case range - Extended cyrillic */ -static signed char UniCaseRangeL0490[60] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ - 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, -}; - -/* Lower case range - Extended latin and greek */ -static signed char UniCaseRangeL1e00[504] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ - 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0, /* 1fc0-1fcf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Lower case range - Wide latin */ -static signed char UniCaseRangeLff20[27] = { - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* ff20-ff2f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* - * Lower Case Range - */ -const struct UniCaseRange CifsUniLowerRange[] = { - {0x0380, 0x03ab, UniCaseRangeL0380}, - {0x0400, 0x042f, UniCaseRangeL0400}, - {0x0490, 0x04cb, UniCaseRangeL0490}, - {0x1e00, 0x1ff7, UniCaseRangeL1e00}, - {0xff20, 0xff3a, UniCaseRangeLff20}, - {0} -}; -#endif diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index a4d8b0ea1c8c..22869cda1356 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -117,6 +117,10 @@ module_param(cifs_max_pending, uint, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for " "CIFS/SMB1 dialect (N/A for SMB3) " "Default: 32767 Range: 2 to 32767."); +unsigned int dir_cache_timeout = 30; +module_param(dir_cache_timeout, uint, 0644); +MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 " + "Range: 1 to 65000 seconds, 0 to disable caching dir contents"); #ifdef CONFIG_CIFS_STATS2 unsigned int slow_rsp_threshold = 1; module_param(slow_rsp_threshold, uint, 0644); @@ -695,6 +699,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); if (tcon->handle_timeout) seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); + if (tcon->max_cached_dirs != MAX_CACHED_FIDS) + seq_printf(s, ",max_cached_dirs=%u", tcon->max_cached_dirs); /* * Display file and directory attribute timeout in seconds. @@ -1077,7 +1083,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) } static int -cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) +cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to @@ -1679,6 +1685,12 @@ init_cifs(void) CIFS_MAX_REQ); } + /* Limit max to about 18 hours, and setting to zero disables directory entry caching */ + if (dir_cache_timeout > 65000) { + dir_cache_timeout = 65000; + cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n"); + } + cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!cifsiod_wq) { rc = -ENOMEM; @@ -1805,7 +1817,7 @@ exit_cifs(void) cifs_dbg(NOISY, "exit_smb3\n"); unregister_filesystem(&cifs_fs_type); unregister_filesystem(&smb3_fs_type); - cifs_dfs_release_automount_timer(); + cifs_release_automount_timer(); exit_cifs_idmap(); #ifdef CONFIG_CIFS_SWN_UPCALL cifs_genl_exit(); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 15c8cc4b6680..41daebd220ff 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -81,7 +81,7 @@ extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, extern const struct inode_operations cifs_file_inode_ops; extern const struct inode_operations cifs_symlink_inode_ops; -extern const struct inode_operations cifs_dfs_referral_inode_operations; +extern const struct inode_operations cifs_namespace_inode_operations; /* Functions related to files and directories */ @@ -118,14 +118,7 @@ extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned extern const struct dentry_operations cifs_dentry_ops; extern const struct dentry_operations cifs_ci_dentry_ops; -#ifdef CONFIG_CIFS_DFS_UPCALL -extern struct vfsmount *cifs_dfs_d_automount(struct path *path); -#else -static inline struct vfsmount *cifs_dfs_d_automount(struct path *path) -{ - return ERR_PTR(-EREMOTE); -} -#endif +extern struct vfsmount *cifs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, @@ -159,6 +152,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 44 -#define CIFS_VERSION "2.44" +#define SMB3_PRODUCT_BUILD 45 +#define CIFS_VERSION "2.45" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 657dee4b2c8c..02082621d8e0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -186,6 +186,12 @@ struct cifs_cred { }; struct cifs_open_info_data { + bool adjust_tz; + union { + bool reparse_point; + bool symlink; + }; + __u32 reparse_tag; char *symlink_target; union { struct smb2_file_all_info fi; @@ -193,6 +199,10 @@ struct cifs_open_info_data { }; }; +#define cifs_open_data_reparse(d) \ + ((d)->reparse_point || \ + (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE)) + static inline void cifs_free_open_info(struct cifs_open_info_data *data) { kfree(data->symlink_target); @@ -318,16 +328,21 @@ struct smb_version_operations { int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); /* query path data from the server */ - int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); + int (*query_path_info)(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data); /* query file data from the server */ int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, struct cifs_open_info_data *data); - /* query reparse tag from srv to determine which type of special file */ - int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *path, - __u32 *reparse_tag); + /* query reparse point to determine which type of special file */ + int (*query_reparse_point)(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype); /* get server index number */ int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid, @@ -376,9 +391,12 @@ struct smb_version_operations { const char *, const char *, struct cifs_sb_info *); /* query symlink target */ - int (*query_symlink)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - char **, bool); + int (*query_symlink)(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + char **target_path, + struct kvec *rsp_iov); /* open a file for non-posix mounts */ int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf); @@ -727,8 +745,9 @@ struct TCP_Server_Info { * primary_server holds the ref-counted * pointer to primary channel connection for the session. */ -#define CIFS_SERVER_IS_CHAN(server) (!!(server)->primary_server) +#define SERVER_IS_CHAN(server) (!!(server)->primary_server) struct TCP_Server_Info *primary_server; + __u16 channel_sequence_num; /* incremented on primary channel on each chan reconnect */ #ifdef CONFIG_CIFS_SWN_UPCALL bool use_swn_dstaddr; @@ -1076,7 +1095,7 @@ cap_unix(struct cifs_ses *ses) * inode with new info */ -#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_JUNCTION 0x1 #define CIFS_FATTR_DELETE_PENDING 0x2 #define CIFS_FATTR_NEED_REVAL 0x4 #define CIFS_FATTR_INO_COLLISION 0x8 @@ -1191,6 +1210,7 @@ struct cifs_tcon { __u32 max_chunks; __u32 max_bytes_chunk; __u32 max_bytes_copy; + __u32 max_cached_dirs; #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ struct fscache_volume *fscache; /* cookie for share */ @@ -1721,11 +1741,23 @@ struct cifs_mount_ctx { struct list_head dfs_ses_list; }; +static inline void __free_dfs_info_param(struct dfs_info3_param *param) +{ + kfree(param->path_name); + kfree(param->node_name); +} + static inline void free_dfs_info_param(struct dfs_info3_param *param) { + if (param) + __free_dfs_info_param(param); +} + +static inline void zfree_dfs_info_param(struct dfs_info3_param *param) +{ if (param) { - kfree(param->path_name); - kfree(param->node_name); + __free_dfs_info_param(param); + memset(param, 0, sizeof(*param)); } } @@ -1775,6 +1807,7 @@ static inline bool is_retryable_error(int error) #define MID_RETRY_NEEDED 8 /* session closed while this request out */ #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 +#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */ /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ @@ -1911,7 +1944,7 @@ require use of the stronger protocol */ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc + * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search @@ -1985,6 +2018,7 @@ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern unsigned int dir_cache_timeout; /* max time for directory lease caching of dir */ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ extern atomic_t mid_count; @@ -2184,4 +2218,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, } } +struct smb2_compound_vars { + struct cifs_open_parms oparms; + struct kvec rsp_iov[3]; + struct smb_rqst rqst[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec close_iov; + struct smb2_file_rename_info rename_info; + struct smb2_file_link_info link_info; +}; + #endif /* _CIFS_GLOB_H */ diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 1d71d658e167..0c37eefa18a5 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -207,6 +207,9 @@ extern struct inode *cifs_iget(struct super_block *sb, int cifs_get_inode_info(struct inode **inode, const char *full_path, struct cifs_open_info_data *data, struct super_block *sb, int xid, const struct cifs_fid *fid); +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + u32 tag); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, @@ -295,11 +298,7 @@ extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); extern void cifs_put_tcon(struct cifs_tcon *tcon); -#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) -extern void cifs_dfs_release_automount_timer(void); -#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ -#define cifs_dfs_release_automount_timer() do { } while (0) -#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ +extern void cifs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); @@ -513,7 +512,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses); extern struct cifs_ses *sesInfoAlloc(void); extern void sesInfoFree(struct cifs_ses *); -extern struct cifs_tcon *tconInfoAlloc(void); +extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled); extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 238538dde4e3..3902e90dca6b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -154,7 +154,7 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, int i; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&pserver->srv_lock); if (!all_channels) { @@ -202,7 +202,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&cifs_tcp_ses_lock); @@ -453,10 +453,10 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ static int reconnect_dfs_server(struct TCP_Server_Info *server) { - int rc = 0; - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); struct dfs_cache_tgt_iterator *target_hint = NULL; + DFS_CACHE_TGT_LIST(tl); int num_targets = 0; + int rc = 0; /* * Determine the number of dfs targets the referral path in @cifs_sb resolves to. @@ -911,8 +911,8 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) return 0; } - -static void clean_demultiplex_info(struct TCP_Server_Info *server) +static noinline_for_stack void +clean_demultiplex_info(struct TCP_Server_Info *server) { int length; @@ -1551,7 +1551,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx) * Skip ses channels since they're only handled in lower layers * (e.g. cifs_send_recv). */ - if (CIFS_SERVER_IS_CHAN(server) || + if (SERVER_IS_CHAN(server) || !match_server(server, ctx, false)) { spin_unlock(&server->srv_lock); continue; @@ -1587,7 +1587,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) spin_unlock(&cifs_tcp_ses_lock); /* For secondary channels, we pick up ref-count on the primary server */ - if (CIFS_SERVER_IS_CHAN(server)) + if (SERVER_IS_CHAN(server)) cifs_put_tcp_session(server->primary_server, from_reconnect); cancel_delayed_work_sync(&server->echo); @@ -1686,6 +1686,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; + tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */ tcp_ses->reconnect_instance = 1; tcp_ses->lstrp = jiffies; tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression); @@ -1792,7 +1793,7 @@ out_err_crypto_release: out_err: if (tcp_ses) { - if (CIFS_SERVER_IS_CHAN(tcp_ses)) + if (SERVER_IS_CHAN(tcp_ses)) cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); @@ -1881,7 +1882,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) } } - tcon = tconInfoAlloc(); + /* no need to setup directory caching on IPC share, so pass in false */ + tcon = tcon_info_alloc(false); if (tcon == NULL) return -ENOMEM; @@ -2491,7 +2493,10 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } - tcon = tconInfoAlloc(); + if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) + tcon = tcon_info_alloc(true); + else + tcon = tcon_info_alloc(false); if (tcon == NULL) { rc = -ENOMEM; goto out_fail; @@ -2656,6 +2661,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->retry = ctx->retry; tcon->nocase = ctx->nocase; tcon->broken_sparse_sup = ctx->no_sparse; + tcon->max_cached_dirs = ctx->max_cached_dirs; if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) tcon->nohandlecache = ctx->nohandlecache; else @@ -3813,7 +3819,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + struct TCP_Server_Info *pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; bool is_binding = false; diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index ee772c3d9f00..81b84151450d 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -3,7 +3,6 @@ * Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de> */ -#include <linux/namei.h> #include "cifsproto.h" #include "cifs_debug.h" #include "dns_resolve.h" @@ -96,51 +95,134 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) return 0; } -static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, const char *full_path, - const struct dfs_cache_tgt_iterator *tit) +static inline int parse_dfs_target(struct smb3_fs_context *ctx, + struct dfs_ref_walk *rw, + struct dfs_info3_param *tgt) +{ + int rc; + const char *fpath = ref_walk_fpath(rw) + 1; + + rc = ref_walk_get_tgt(rw, tgt); + if (!rc) + rc = dfs_parse_target_referral(fpath, tgt, ctx); + return rc; +} + +static int set_ref_paths(struct cifs_mount_ctx *mnt_ctx, + struct dfs_info3_param *tgt, + struct dfs_ref_walk *rw) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct dfs_info3_param ref = {}; - bool is_refsrv; - int rc, rc2; + struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; + char *ref_path, *full_path; + int rc; - rc = dfs_cache_get_tgt_referral(ref_path + 1, tit, &ref); - if (rc) + full_path = smb3_fs_context_fullpath(ctx, CIFS_DIR_SEP(cifs_sb)); + if (IS_ERR(full_path)) + return PTR_ERR(full_path); + + if (!tgt || (tgt->server_type == DFS_TYPE_LINK && + DFS_INTERLINK(tgt->flags))) + ref_path = dfs_get_path(cifs_sb, ctx->UNC); + else + ref_path = dfs_get_path(cifs_sb, full_path); + if (IS_ERR(ref_path)) { + rc = PTR_ERR(ref_path); + kfree(full_path); return rc; + } + ref_walk_path(rw) = ref_path; + ref_walk_fpath(rw) = full_path; + return 0; +} - rc = dfs_parse_target_referral(full_path + 1, &ref, ctx); - if (rc) - goto out; +static int __dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx, + struct dfs_ref_walk *rw) +{ + struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; + struct dfs_info3_param tgt = {}; + bool is_refsrv; + int rc = -ENOENT; - cifs_mount_put_conns(mnt_ctx); - rc = get_session(mnt_ctx, ref_path); - if (rc) - goto out; +again: + do { + if (ref_walk_empty(rw)) { + rc = dfs_get_referral(mnt_ctx, ref_walk_path(rw) + 1, + NULL, ref_walk_tl(rw)); + if (rc) { + rc = cifs_mount_get_tcon(mnt_ctx); + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); + continue; + } + if (!ref_walk_num_tgts(rw)) { + rc = -ENOENT; + continue; + } + } - is_refsrv = !!(ref.flags & DFSREF_REFERRAL_SERVER); + while (ref_walk_next_tgt(rw)) { + rc = parse_dfs_target(ctx, rw, &tgt); + if (rc) + continue; - rc = -EREMOTE; - if (ref.flags & DFSREF_STORAGE_SERVER) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (rc) - goto out; + cifs_mount_put_conns(mnt_ctx); + rc = get_session(mnt_ctx, ref_walk_path(rw)); + if (rc) + continue; - /* some servers may not advertise referral capability under ref.flags */ - is_refsrv |= is_tcon_dfs(mnt_ctx->tcon); + is_refsrv = tgt.server_type == DFS_TYPE_ROOT || + DFS_INTERLINK(tgt.flags); + ref_walk_set_tgt_hint(rw); - rc = cifs_is_path_remote(mnt_ctx); - } + if (tgt.flags & DFSREF_STORAGE_SERVER) { + rc = cifs_mount_get_tcon(mnt_ctx); + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); + if (!rc) + break; + if (rc != -EREMOTE) + continue; + } - dfs_cache_noreq_update_tgthint(ref_path + 1, tit); + if (is_refsrv) { + rc = add_root_smb_session(mnt_ctx); + if (rc) + goto out; + } - if (rc == -EREMOTE && is_refsrv) { - rc2 = add_root_smb_session(mnt_ctx); - if (rc2) - rc = rc2; - } + rc = ref_walk_advance(rw); + if (!rc) { + rc = set_ref_paths(mnt_ctx, &tgt, rw); + if (!rc) { + rc = -EREMOTE; + goto again; + } + } + if (rc != -ELOOP) + goto out; + } + } while (rc && ref_walk_descend(rw)); out: - free_dfs_info_param(&ref); + free_dfs_info_param(&tgt); + return rc; +} + +static int dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx) +{ + struct dfs_ref_walk *rw; + int rc; + + rw = ref_walk_alloc(); + if (IS_ERR(rw)) + return PTR_ERR(rw); + + ref_walk_init(rw); + rc = set_ref_paths(mnt_ctx, NULL, rw); + if (!rc) + rc = __dfs_referral_walk(mnt_ctx, rw); + ref_walk_free(rw); return rc; } @@ -148,105 +230,48 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) { struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - char *ref_path = NULL, *full_path = NULL; - struct dfs_cache_tgt_iterator *tit; struct cifs_tcon *tcon; - char *origin_fullpath = NULL; - char sep = CIFS_DIR_SEP(cifs_sb); - int num_links = 0; + char *origin_fullpath; int rc; - ref_path = dfs_get_path(cifs_sb, ctx->UNC); - if (IS_ERR(ref_path)) - return PTR_ERR(ref_path); + origin_fullpath = dfs_get_path(cifs_sb, ctx->source); + if (IS_ERR(origin_fullpath)) + return PTR_ERR(origin_fullpath); - full_path = smb3_fs_context_fullpath(ctx, sep); - if (IS_ERR(full_path)) { - rc = PTR_ERR(full_path); - full_path = NULL; + rc = dfs_referral_walk(mnt_ctx); + if (rc) goto out; - } - origin_fullpath = kstrdup(full_path, GFP_KERNEL); - if (!origin_fullpath) { - rc = -ENOMEM; - goto out; + tcon = mnt_ctx->tcon; + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + tcon->origin_fullpath = origin_fullpath; + origin_fullpath = NULL; } + spin_unlock(&tcon->tc_lock); - do { - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); - - rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl); - if (rc) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (!rc) - rc = cifs_is_path_remote(mnt_ctx); - break; - } - - tit = dfs_cache_get_tgt_iterator(&tl); - if (!tit) { - cifs_dbg(VFS, "%s: dfs referral (%s) with no targets\n", __func__, - ref_path + 1); - rc = -ENOENT; - dfs_cache_free_tgts(&tl); - break; - } - - do { - rc = get_dfs_conn(mnt_ctx, ref_path, full_path, tit); - if (!rc) - break; - if (rc == -EREMOTE) { - if (++num_links > MAX_NESTED_LINKS) { - rc = -ELOOP; - break; - } - kfree(ref_path); - kfree(full_path); - ref_path = full_path = NULL; - - full_path = smb3_fs_context_fullpath(ctx, sep); - if (IS_ERR(full_path)) { - rc = PTR_ERR(full_path); - full_path = NULL; - } else { - ref_path = dfs_get_path(cifs_sb, full_path); - if (IS_ERR(ref_path)) { - rc = PTR_ERR(ref_path); - ref_path = NULL; - } - } - break; - } - } while ((tit = dfs_cache_get_next_tgt(&tl, tit))); - dfs_cache_free_tgts(&tl); - } while (rc == -EREMOTE); - - if (!rc) { - tcon = mnt_ctx->tcon; - - spin_lock(&tcon->tc_lock); - if (!tcon->origin_fullpath) { - tcon->origin_fullpath = origin_fullpath; - origin_fullpath = NULL; - } - spin_unlock(&tcon->tc_lock); - - if (list_empty(&tcon->dfs_ses_list)) { - list_replace_init(&mnt_ctx->dfs_ses_list, - &tcon->dfs_ses_list); - queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, - dfs_cache_get_ttl() * HZ); - } else { - dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); - } + if (list_empty(&tcon->dfs_ses_list)) { + list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list); + queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, + dfs_cache_get_ttl() * HZ); + } else { + dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); } out: kfree(origin_fullpath); - kfree(ref_path); - kfree(full_path); + return rc; +} + +/* Resolve UNC hostname in @ctx->source and set ip addr in @ctx->dstaddr */ +static int update_fs_context_dstaddr(struct smb3_fs_context *ctx) +{ + struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; + int rc; + + rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL); + if (!rc) + cifs_set_port(addr, ctx->port); return rc; } @@ -256,6 +281,10 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) bool nodfs = ctx->nodfs; int rc; + rc = update_fs_context_dstaddr(ctx); + if (rc) + return rc; + *isdfs = false; rc = get_session(mnt_ctx, NULL); if (rc) @@ -426,7 +455,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t /* Try to tree connect to all dfs targets */ for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) { const char *target = dfs_cache_get_tgt_name(tit); - struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl); + DFS_CACHE_TGT_LIST(ntl); kfree(share); kfree(prefix); @@ -520,7 +549,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru int rc; struct TCP_Server_Info *server = tcon->ses->server; const struct smb_version_operations *ops = server->ops; - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); + DFS_CACHE_TGT_LIST(tl); struct cifs_sb_info *cifs_sb = NULL; struct super_block *sb = NULL; struct dfs_info3_param ref = {0}; diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index 98e9d2aca6a7..875ab7ae57fc 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -9,6 +9,110 @@ #include "cifsglob.h" #include "fs_context.h" #include "cifs_unicode.h" +#include <linux/namei.h> + +#define DFS_INTERLINK(v) \ + (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) + +struct dfs_ref { + char *path; + char *full_path; + struct dfs_cache_tgt_list tl; + struct dfs_cache_tgt_iterator *tit; +}; + +struct dfs_ref_walk { + struct dfs_ref *ref; + struct dfs_ref refs[MAX_NESTED_LINKS]; +}; + +#define ref_walk_start(w) ((w)->refs) +#define ref_walk_end(w) (&(w)->refs[ARRAY_SIZE((w)->refs) - 1]) +#define ref_walk_cur(w) ((w)->ref) +#define ref_walk_descend(w) (--ref_walk_cur(w) >= ref_walk_start(w)) + +#define ref_walk_tit(w) (ref_walk_cur(w)->tit) +#define ref_walk_empty(w) (!ref_walk_tit(w)) +#define ref_walk_path(w) (ref_walk_cur(w)->path) +#define ref_walk_fpath(w) (ref_walk_cur(w)->full_path) +#define ref_walk_tl(w) (&ref_walk_cur(w)->tl) + +static inline struct dfs_ref_walk *ref_walk_alloc(void) +{ + struct dfs_ref_walk *rw; + + rw = kmalloc(sizeof(*rw), GFP_KERNEL); + if (!rw) + return ERR_PTR(-ENOMEM); + return rw; +} + +static inline void ref_walk_init(struct dfs_ref_walk *rw) +{ + memset(rw, 0, sizeof(*rw)); + ref_walk_cur(rw) = ref_walk_start(rw); +} + +static inline void __ref_walk_free(struct dfs_ref *ref) +{ + kfree(ref->path); + kfree(ref->full_path); + dfs_cache_free_tgts(&ref->tl); + memset(ref, 0, sizeof(*ref)); +} + +static inline void ref_walk_free(struct dfs_ref_walk *rw) +{ + struct dfs_ref *ref = ref_walk_start(rw); + + for (; ref <= ref_walk_end(rw); ref++) + __ref_walk_free(ref); + kfree(rw); +} + +static inline int ref_walk_advance(struct dfs_ref_walk *rw) +{ + struct dfs_ref *ref = ref_walk_cur(rw) + 1; + + if (ref > ref_walk_end(rw)) + return -ELOOP; + __ref_walk_free(ref); + ref_walk_cur(rw) = ref; + return 0; +} + +static inline struct dfs_cache_tgt_iterator * +ref_walk_next_tgt(struct dfs_ref_walk *rw) +{ + struct dfs_cache_tgt_iterator *tit; + struct dfs_ref *ref = ref_walk_cur(rw); + + if (!ref->tit) + tit = dfs_cache_get_tgt_iterator(&ref->tl); + else + tit = dfs_cache_get_next_tgt(&ref->tl, ref->tit); + ref->tit = tit; + return tit; +} + +static inline int ref_walk_get_tgt(struct dfs_ref_walk *rw, + struct dfs_info3_param *tgt) +{ + zfree_dfs_info_param(tgt); + return dfs_cache_get_tgt_referral(ref_walk_path(rw) + 1, + ref_walk_tit(rw), tgt); +} + +static inline int ref_walk_num_tgts(struct dfs_ref_walk *rw) +{ + return dfs_cache_get_nr_tgts(ref_walk_tl(rw)); +} + +static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw) +{ + dfs_cache_noreq_update_tgthint(ref_walk_path(rw) + 1, + ref_walk_tit(rw)); +} struct dfs_root_ses { struct list_head list; @@ -34,43 +138,6 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p cifs_remap(cifs_sb), path, ref, tl); } -/* Return DFS full path out of a dentry set for automount */ -static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - size_t len; - char *s; - - spin_lock(&tcon->tc_lock); - if (unlikely(!tcon->origin_fullpath)) { - spin_unlock(&tcon->tc_lock); - return ERR_PTR(-EREMOTE); - } - spin_unlock(&tcon->tc_lock); - - s = dentry_path_raw(dentry, page, PATH_MAX); - if (IS_ERR(s)) - return s; - /* for root, we want "" */ - if (!s[1]) - s++; - - spin_lock(&tcon->tc_lock); - len = strlen(tcon->origin_fullpath); - if (s < (char *)page + len) { - spin_unlock(&tcon->tc_lock); - return ERR_PTR(-ENAMETOOLONG); - } - - s -= len; - memcpy(s, tcon->origin_fullpath, len); - spin_unlock(&tcon->tc_lock); - convert_delimiter(s, '/'); - - return s; -} - static inline void dfs_put_root_smb_sessions(struct list_head *head) { struct dfs_root_ses *root, *tmp; diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 33adf43a01f1..508d831fabe3 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -29,8 +29,6 @@ #define CACHE_MIN_TTL 120 /* 2 minutes */ #define CACHE_DEFAULT_TTL 300 /* 5 minutes */ -#define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) - struct cache_dfs_tgt { char *name; int path_consumed; @@ -174,7 +172,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v) "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", + DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { @@ -243,7 +241,7 @@ static inline void dump_ce(const struct cache_entry *ce) ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", + DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); @@ -1177,9 +1175,9 @@ static bool is_ses_good(struct cifs_ses *ses) /* Refresh dfs referral of tcon and mark it for reconnect if needed */ static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh) { - struct dfs_cache_tgt_list old_tl = DFS_CACHE_TGT_LIST_INIT(old_tl); - struct dfs_cache_tgt_list new_tl = DFS_CACHE_TGT_LIST_INIT(new_tl); struct TCP_Server_Info *server = ses->server; + DFS_CACHE_TGT_LIST(old_tl); + DFS_CACHE_TGT_LIST(new_tl); bool needs_refresh = false; struct cache_entry *ce; unsigned int xid; diff --git a/fs/smb/client/dfs_cache.h b/fs/smb/client/dfs_cache.h index c6d89cd6d4fd..18a08a2ca93b 100644 --- a/fs/smb/client/dfs_cache.h +++ b/fs/smb/client/dfs_cache.h @@ -16,7 +16,11 @@ extern struct workqueue_struct *dfscache_wq; extern atomic_t dfs_cache_ttl; -#define DFS_CACHE_TGT_LIST_INIT(var) { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), } +#define DFS_CACHE_TGT_LIST_INIT(var) \ + { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), } + +#define DFS_CACHE_TGT_LIST(var) \ + struct dfs_cache_tgt_list var = DFS_CACHE_TGT_LIST_INIT(var) struct dfs_cache_tgt_list { int tl_numtgts; @@ -51,8 +55,8 @@ static inline struct dfs_cache_tgt_iterator * dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, struct dfs_cache_tgt_iterator *it) { - if (!tl || list_empty(&tl->tl_list) || !it || - list_is_last(&it->it_list, &tl->tl_list)) + if (!tl || !tl->tl_numtgts || list_empty(&tl->tl_list) || + !it || list_is_last(&it->it_list, &tl->tl_list)) return NULL; return list_next_entry(it, it_list); } @@ -71,7 +75,7 @@ static inline void dfs_cache_free_tgts(struct dfs_cache_tgt_list *tl) { struct dfs_cache_tgt_iterator *it, *nit; - if (!tl || list_empty(&tl->tl_list)) + if (!tl || !tl->tl_numtgts || list_empty(&tl->tl_list)) return; list_for_each_entry_safe(it, nit, &tl->tl_list, it_list) { list_del(&it->it_list); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 30b1e1bfd204..580a27a3a7e6 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -797,7 +797,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) const struct dentry_operations cifs_dentry_ops = { .d_revalidate = cifs_d_revalidate, - .d_automount = cifs_dfs_d_automount, + .d_automount = cifs_d_automount, /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; @@ -872,5 +872,5 @@ const struct dentry_operations cifs_ci_dentry_ops = { .d_revalidate = cifs_d_revalidate, .d_hash = cifs_ci_hash, .d_compare = cifs_ci_compare, - .d_automount = cifs_dfs_d_automount, + .d_automount = cifs_d_automount, }; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 6bc44f79d2e9..2108b3b40ce9 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1085,7 +1085,7 @@ int cifs_close(struct inode *inode, struct file *file) !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -2596,7 +2596,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) write_data, to - from, &offset); cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ - inode->i_atime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); if ((bytes_written > 0) && (offset)) rc = 0; else if (bytes_written < 0) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 67e16c2ac90e..e45ce31bbda7 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -150,6 +150,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("closetimeo", Opt_closetimeo), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), + fsparam_u32("max_cached_dirs", Opt_max_cached_dirs), fsparam_u32("handletimeout", Opt_handletimeout), fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), @@ -1165,6 +1166,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (result.uint_32 > 1) ctx->multichannel = true; break; + case Opt_max_cached_dirs: + if (result.uint_32 < 1) { + cifs_errorf(fc, "%s: Invalid max_cached_dirs, needs to be 1 or more\n", + __func__); + goto cifs_parse_mount_err; + } + ctx->max_cached_dirs = result.uint_32; + break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { @@ -1592,7 +1601,7 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->acregmax = CIFS_DEF_ACTIMEO; ctx->acdirmax = CIFS_DEF_ACTIMEO; ctx->closetimeo = SMB3_DEF_DCLOSETIMEO; - + ctx->max_cached_dirs = MAX_CACHED_FIDS; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index f4eaf8558902..9d8d34af0211 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -128,6 +128,7 @@ enum cifs_param { Opt_closetimeo, Opt_echo_interval, Opt_max_credits, + Opt_max_cached_dirs, Opt_snapshot, Opt_max_channels, Opt_handletimeout, @@ -261,6 +262,7 @@ struct smb3_fs_context { __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ unsigned int max_channels; + unsigned int max_cached_dirs; __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */ bool rootfs:1; /* if it's a SMB root file system */ bool witness:1; /* use witness protocol */ @@ -287,7 +289,7 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); */ #define SMB3_MAX_DCLOSETIMEO (1 << 30) #define SMB3_DEF_DCLOSETIMEO (1 * HZ) /* even 1 sec enough to help eg open/write/close/open/read */ - +#define MAX_CACHED_FIDS 16 extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp); #endif diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index 8f6909d633da..e5cad149f5a2 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -48,7 +48,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) sharename = extract_sharename(tcon->tree_name); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); - return -EINVAL; + return PTR_ERR(sharename); } slen = strlen(sharename); @@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); + if (cifsi->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h index 173999610997..84f3b09367d2 100644 --- a/fs/smb/client/fscache.h +++ b/fs/smb/client/fscache.h @@ -50,12 +50,13 @@ void cifs_fscache_fill_coherency(struct inode *inode, struct cifs_fscache_inode_coherency_data *cd) { struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct timespec64 ctime = inode_get_ctime(inode); memset(cd, 0, sizeof(*cd)); cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); - cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); - cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec); } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index c3eeae07e139..d7c302442c1e 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -58,13 +58,9 @@ static void cifs_set_ops(struct inode *inode) inode->i_data.a_ops = &cifs_addr_ops; break; case S_IFDIR: -#ifdef CONFIG_CIFS_DFS_UPCALL if (IS_AUTOMOUNT(inode)) { - inode->i_op = &cifs_dfs_referral_inode_operations; + inode->i_op = &cifs_namespace_inode_operations; } else { -#else /* NO DFS support, treat as a directory */ - { -#endif inode->i_op = &cifs_dir_inode_ops; inode->i_fop = &cifs_dir_ops; } @@ -172,7 +168,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else inode->i_atime = fattr->cf_atime; inode->i_mtime = fattr->cf_mtime; - inode->i_ctime = fattr->cf_ctime; + inode_set_ctime_to_ts(inode, fattr->cf_ctime); inode->i_rdev = fattr->cf_rdev; cifs_nlink_fattr_to_inode(inode, fattr); inode->i_uid = fattr->cf_uid; @@ -218,7 +214,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } spin_unlock(&inode->i_lock); - if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + if (fattr->cf_flags & CIFS_FATTR_JUNCTION) inode->i_flags |= S_AUTOMOUNT; if (inode->i_state & I_NEW) cifs_set_ops(inode); @@ -327,14 +323,14 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, * * Needed to setup cifs_fattr data for the directory which is the * junction to the new submount (ie to setup the fake directory - * which represents a DFS referral). + * which represents a DFS referral or reparse mount point). */ -static void -cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) +static void cifs_create_junction_fattr(struct cifs_fattr *fattr, + struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cifs_dbg(FYI, "creating fake fattr for DFS referral\n"); + cifs_dbg(FYI, "%s: creating fake fattr\n", __func__); memset(fattr, 0, sizeof(*fattr)); fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; @@ -343,7 +339,33 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) ktime_get_coarse_real_ts64(&fattr->cf_mtime); fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; - fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL; + fattr->cf_flags = CIFS_FATTR_JUNCTION; +} + +/* Update inode with final fattr data */ +static int update_inode_info(struct super_block *sb, + struct cifs_fattr *fattr, + struct inode **inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + int rc = 0; + + if (!*inode) { + *inode = cifs_iget(sb, fattr); + if (!*inode) + rc = -ENOMEM; + return rc; + } + /* We already have inode, update it. + * + * If file type or uniqueid is different, return error. + */ + if (unlikely((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + CIFS_I(*inode)->uniqueid != fattr->cf_uniqueid)) { + CIFS_I(*inode)->time = 0; /* force reval */ + return -ESTALE; + } + return cifs_fattr_to_inode(*inode, fattr); } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY @@ -373,7 +395,7 @@ cifs_get_file_info_unix(struct file *filp) if (!rc) { cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); + cifs_create_junction_fattr(&fattr, inode->i_sb); rc = 0; } else goto cifs_gfiunix_out; @@ -385,17 +407,18 @@ cifs_gfiunix_out: return rc; } -int cifs_get_inode_info_unix(struct inode **pinode, - const unsigned char *full_path, - struct super_block *sb, unsigned int xid) +static int cifs_get_unix_fattr(const unsigned char *full_path, + struct super_block *sb, + struct cifs_fattr *fattr, + struct inode **pinode, + const unsigned int xid) { - int rc; + struct TCP_Server_Info *server; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); FILE_UNIX_BASIC_INFO find_data; - struct cifs_fattr fattr; struct cifs_tcon *tcon; - struct TCP_Server_Info *server; struct tcon_link *tlink; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + int rc, tmprc; cifs_dbg(FYI, "Getting info on %s\n", full_path); @@ -412,59 +435,61 @@ int cifs_get_inode_info_unix(struct inode **pinode, cifs_put_tlink(tlink); if (!rc) { - cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + cifs_unix_basic_to_fattr(fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, sb); + cifs_create_junction_fattr(fattr, sb); rc = 0; } else { return rc; } + if (!*pinode) + cifs_fill_uniqueid(sb, fattr); + /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, - full_path); - if (tmprc) - cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } - if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) { + if (S_ISLNK(fattr->cf_mode) && !fattr->cf_symlink_target) { if (!server->ops->query_symlink) return -EOPNOTSUPP; - rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &fattr.cf_symlink_target, false); - if (rc) { - cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); - goto cgiiu_exit; - } + rc = server->ops->query_symlink(xid, tcon, + cifs_sb, full_path, + &fattr->cf_symlink_target, + NULL); + cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); } + return rc; +} - if (*pinode == NULL) { - /* get new inode */ - cifs_fill_uniqueid(sb, &fattr); - *pinode = cifs_iget(sb, &fattr); - if (!*pinode) - rc = -ENOMEM; - } else { - /* we already have inode, update it */ - - /* if uniqueid is different, return error */ - if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && - CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { - CIFS_I(*pinode)->time = 0; /* force reval */ - rc = -ESTALE; - goto cgiiu_exit; - } +int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *full_path, + struct super_block *sb, unsigned int xid) +{ + struct cifs_fattr fattr = {}; + int rc; - /* if filetype is different, return error */ - rc = cifs_fattr_to_inode(*pinode, &fattr); - } + rc = cifs_get_unix_fattr(full_path, sb, &fattr, pinode, xid); + if (rc) + goto out; -cgiiu_exit: + rc = update_inode_info(sb, &fattr, pinode); +out: kfree(fattr.cf_symlink_target); return rc; } #else +static inline int cifs_get_unix_fattr(const unsigned char *full_path, + struct super_block *sb, + struct cifs_fattr *fattr, + struct inode **pinode, + const unsigned int xid) +{ + return -EOPNOTSUPP; +} + int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *full_path, struct super_block *sb, unsigned int xid) @@ -632,10 +657,11 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from POSIX info struct */ -static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, +static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, + struct cifs_open_info_data *data, struct cifs_sid *owner, struct cifs_sid *group, - struct super_block *sb, bool adjust_tz, bool symlink) + struct super_block *sb) { struct smb311_posix_qinfo *info = &data->posix_fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -655,7 +681,7 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); - if (adjust_tz) { + if (data->adjust_tz) { fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; } @@ -669,7 +695,7 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */ /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */ - if (symlink) { + if (data->symlink) { fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; fattr->cf_symlink_target = data->symlink_target; @@ -690,9 +716,46 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } -static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, - struct super_block *sb, bool adjust_tz, bool symlink, - u32 reparse_tag) +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + u32 tag) +{ + switch (tag) { + case IO_REPARSE_TAG_LX_SYMLINK: + fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_LNK; + break; + case IO_REPARSE_TAG_LX_FIFO: + fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_FIFO; + break; + case IO_REPARSE_TAG_AF_UNIX: + fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_SOCK; + break; + case IO_REPARSE_TAG_LX_CHR: + fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_CHR; + break; + case IO_REPARSE_TAG_LX_BLK: + fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_BLK; + break; + case 0: /* SMB1 symlink */ + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_NFS: + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; + break; + default: + return false; + } + return true; +} + +static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, + struct cifs_open_info_data *data, + struct super_block *sb) { struct smb2_file_all_info *info = &data->fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -711,7 +774,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); - if (adjust_tz) { + if (data->adjust_tz) { fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; } @@ -719,28 +782,13 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); - if (reparse_tag == IO_REPARSE_TAG_LX_SYMLINK) { - fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_LNK; - } else if (reparse_tag == IO_REPARSE_TAG_LX_FIFO) { - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_FIFO; - } else if (reparse_tag == IO_REPARSE_TAG_AF_UNIX) { - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_SOCK; - } else if (reparse_tag == IO_REPARSE_TAG_LX_CHR) { - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_CHR; - } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_BLK; - } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK || - reparse_tag == IO_REPARSE_TAG_NFS) { - fattr->cf_mode = S_IFLNK; - fattr->cf_dtype = DT_LNK; - } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + + if (cifs_open_data_reparse(data) && + cifs_reparse_point_to_fattr(cifs_sb, fattr, data->reparse_tag)) + goto out_reparse; + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->ctx->dir_mode; fattr->cf_dtype = DT_DIR; /* @@ -769,6 +817,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i } } +out_reparse: if (S_ISLNK(fattr->cf_mode)) { fattr->cf_symlink_target = data->symlink_target; data->symlink_target = NULL; @@ -789,8 +838,6 @@ cifs_get_file_info(struct file *filp) struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - bool symlink = false; - u32 tag = 0; if (!server->ops->query_file_info) return -ENOSYS; @@ -800,14 +847,15 @@ cifs_get_file_info(struct file *filp) switch (rc) { case 0: /* TODO: add support to query reparse tag */ + data.adjust_tz = false; if (data.symlink_target) { - symlink = true; - tag = IO_REPARSE_TAG_SYMLINK; + data.symlink = true; + data.reparse_tag = IO_REPARSE_TAG_SYMLINK; } - cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag); + cifs_open_info_to_fattr(&fattr, &data, inode->i_sb); break; case -EREMOTE: - cifs_create_dfs_fattr(&fattr, inode->i_sb); + cifs_create_junction_fattr(&fattr, inode->i_sb); rc = 0; break; case -EOPNOTSUPP: @@ -960,22 +1008,66 @@ static inline bool is_inode_cache_good(struct inode *ino) return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; } -int cifs_get_inode_info(struct inode **inode, const char *full_path, - struct cifs_open_info_data *data, struct super_block *sb, int xid, - const struct cifs_fid *fid) +static int reparse_info_to_fattr(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + struct cifs_fattr *fattr) +{ + struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct kvec rsp_iov, *iov = NULL; + int rsp_buftype = CIFS_NO_BUFFER; + u32 tag = data->reparse_tag; + int rc = 0; + + if (!tag && server->ops->query_reparse_point) { + rc = server->ops->query_reparse_point(xid, tcon, cifs_sb, + full_path, &tag, + &rsp_iov, &rsp_buftype); + if (!rc) + iov = &rsp_iov; + } + switch ((data->reparse_tag = tag)) { + case 0: /* SMB1 symlink */ + iov = NULL; + fallthrough; + case IO_REPARSE_TAG_NFS: + case IO_REPARSE_TAG_SYMLINK: + if (!data->symlink_target && server->ops->query_symlink) { + rc = server->ops->query_symlink(xid, tcon, + cifs_sb, full_path, + &data->symlink_target, + iov); + } + break; + case IO_REPARSE_TAG_MOUNT_POINT: + cifs_create_junction_fattr(fattr, sb); + goto out; + } + + cifs_open_info_to_fattr(fattr, data, sb); +out: + free_rsp_buf(rsp_buftype, rsp_iov.iov_base); + return rc; +} + +static int cifs_get_fattr(struct cifs_open_info_data *data, + struct super_block *sb, int xid, + const struct cifs_fid *fid, + struct cifs_fattr *fattr, + struct inode **inode, + const char *full_path) { + struct cifs_open_info_data tmp_data = {}; struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - bool adjust_tz = false; - struct cifs_fattr fattr = {0}; - bool is_reparse_point = false; - struct cifs_open_info_data tmp_data = {}; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; - __u32 reparse_tag = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -988,12 +1080,8 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, */ if (!data) { - if (is_inode_cache_good(*inode)) { - cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); - goto out; - } - rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data, - &adjust_tz, &is_reparse_point); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, &tmp_data); data = &tmp_data; } @@ -1008,28 +1096,16 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, * since we have to check if its reparse tag matches a known * special file type e.g. symlink or fifo or char etc. */ - if (is_reparse_point && data->symlink_target) { - reparse_tag = IO_REPARSE_TAG_SYMLINK; - } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) && - server->ops->query_reparse_tag) { - tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path, - &reparse_tag); - if (tmprc) - cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc); - if (server->ops->query_symlink) { - tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &data->symlink_target, - is_reparse_point); - if (tmprc) - cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__, - tmprc); - } + if (cifs_open_data_reparse(data)) { + rc = reparse_info_to_fattr(data, sb, xid, tcon, + full_path, fattr); + } else { + cifs_open_info_to_fattr(fattr, data, sb); } - cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ - cifs_create_dfs_fattr(&fattr, sb); + cifs_create_junction_fattr(fattr, sb); rc = 0; break; case -EACCES: @@ -1059,8 +1135,8 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, fdi = (FILE_DIRECTORY_INFO *)fi; si = (SEARCH_ID_FULL_DIR_INFO *)fi; - cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); + cifs_dir_info_to_fattr(fattr, fdi, cifs_sb); + fattr->cf_uniqueid = le64_to_cpu(si->UniqueId); /* uniqueid set, skip get inum step */ goto handle_mnt_opt; } else { @@ -1077,10 +1153,10 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, } /* - * 3. Get or update inode number (fattr.cf_uniqueid) + * 3. Get or update inode number (fattr->cf_uniqueid) */ - cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr); + cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, fattr); /* * 4. Tweak fattr based on mount options @@ -1089,17 +1165,17 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, handle_mnt_opt: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* query for SFU type info if supported and needed */ - if (fattr.cf_cifsattrs & ATTR_SYSTEM && - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); + if ((fattr->cf_cifsattrs & ATTR_SYSTEM) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { + tmprc = cifs_sfu_type(fattr, full_path, cifs_sb, xid); if (tmprc) cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); } /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) { - rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true, - full_path, fid); + rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode, + true, full_path, fid); if (rc == -EREMOTE) rc = 0; if (rc) { @@ -1108,8 +1184,8 @@ handle_mnt_opt: goto out; } } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, - full_path, fid); + rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode, + false, full_path, fid); if (rc == -EREMOTE) rc = 0; if (rc) { @@ -1121,60 +1197,57 @@ handle_mnt_opt: /* fill in remaining high mode bits e.g. SUID, VTX */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) - cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); + cifs_sfu_mode(fattr, full_path, cifs_sb, xid); /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, - full_path); - if (tmprc) - cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } - /* - * 5. Update inode with final fattr data - */ - - if (!*inode) { - *inode = cifs_iget(sb, &fattr); - if (!*inode) - rc = -ENOMEM; - } else { - /* we already have inode, update it */ - - /* if uniqueid is different, return error */ - if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && - CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - /* if filetype is different, return error */ - rc = cifs_fattr_to_inode(*inode, &fattr); - } out: cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); cifs_free_open_info(&tmp_data); + return rc; +} + +int cifs_get_inode_info(struct inode **inode, + const char *full_path, + struct cifs_open_info_data *data, + struct super_block *sb, int xid, + const struct cifs_fid *fid) +{ + struct cifs_fattr fattr = {}; + int rc; + + if (is_inode_cache_good(*inode)) { + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); + return 0; + } + + rc = cifs_get_fattr(data, sb, xid, fid, &fattr, inode, full_path); + if (rc) + goto out; + + rc = update_inode_info(sb, &fattr, inode); +out: kfree(fattr.cf_symlink_target); return rc; } -int -smb311_posix_get_inode_info(struct inode **inode, - const char *full_path, - struct super_block *sb, unsigned int xid) +static int smb311_posix_get_fattr(struct cifs_fattr *fattr, + const char *full_path, + struct super_block *sb, + const unsigned int xid) { + struct cifs_open_info_data data = {}; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - bool adjust_tz = false; - struct cifs_fattr fattr = {0}; - bool symlink = false; - struct cifs_open_info_data data = {}; struct cifs_sid owner, group; - int rc = 0; - int tmprc = 0; + int tmprc; + int rc; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1185,14 +1258,9 @@ smb311_posix_get_inode_info(struct inode **inode, * 1. Fetch file metadata */ - if (is_inode_cache_good(*inode)) { - cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); - goto out; - } - - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, - &owner, &group, &adjust_tz, - &symlink); + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, + full_path, &data, + &owner, &group); /* * 2. Convert it to internal cifs metadata (fattr) @@ -1200,12 +1268,11 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, &data, &owner, &group, - sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb); break; case -EREMOTE: /* DFS link, no metadata available on this server */ - cifs_create_dfs_fattr(&fattr, sb); + cifs_create_junction_fattr(fattr, sb); rc = 0; break; case -EACCES: @@ -1221,49 +1288,42 @@ smb311_posix_get_inode_info(struct inode **inode, goto out; } - /* * 3. Tweak fattr based on mount options */ - /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, - full_path); - if (tmprc) - cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } - /* - * 4. Update inode with final fattr data - */ - - if (!*inode) { - *inode = cifs_iget(sb, &fattr); - if (!*inode) - rc = -ENOMEM; - } else { - /* we already have inode, update it */ +out: + cifs_put_tlink(tlink); + cifs_free_open_info(&data); + return rc; +} - /* if uniqueid is different, return error */ - if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && - CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } +int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, + struct super_block *sb, const unsigned int xid) +{ + struct cifs_fattr fattr = {}; + int rc; - /* if filetype is different, return error */ - rc = cifs_fattr_to_inode(*inode, &fattr); + if (is_inode_cache_good(*inode)) { + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); + return 0; } + + rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid); + if (rc) + goto out; + + rc = update_inode_info(sb, &fattr, inode); out: - cifs_put_tlink(tlink); - cifs_free_open_info(&data); kfree(fattr.cf_symlink_target); return rc; } - static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; @@ -1367,13 +1427,14 @@ retry_iget5_locked: /* gets root inode */ struct inode *cifs_root_iget(struct super_block *sb) { - unsigned int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct inode *inode = NULL; - long rc; + struct cifs_fattr fattr = {}; struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct inode *inode = NULL; + unsigned int xid; char *path = NULL; int len; + int rc; if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && cifs_sb->prepath) { @@ -1391,21 +1452,29 @@ struct inode *cifs_root_iget(struct super_block *sb) xid = get_xid(); if (tcon->unix_ext) { - rc = cifs_get_inode_info_unix(&inode, path, sb, xid); + rc = cifs_get_unix_fattr(path, sb, &fattr, &inode, xid); /* some servers mistakenly claim POSIX support */ if (rc != -EOPNOTSUPP) - goto iget_no_retry; + goto iget_root; cifs_dbg(VFS, "server does not support POSIX extensions\n"); tcon->unix_ext = false; } convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); if (tcon->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, path, sb, xid); + rc = smb311_posix_get_fattr(&fattr, path, sb, xid); else - rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL); + rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path); + +iget_root: + if (!rc) { + if (fattr.cf_flags & CIFS_FATTR_JUNCTION) { + fattr.cf_flags &= ~CIFS_FATTR_JUNCTION; + cifs_autodisable_serverino(cifs_sb); + } + inode = cifs_iget(sb, &fattr); + } -iget_no_retry: if (!inode) { inode = ERR_PTR(rc); goto out; @@ -1429,6 +1498,7 @@ iget_no_retry: out: kfree(path); free_xid(xid); + kfree(fattr.cf_symlink_target); return inode; } @@ -1744,9 +1814,9 @@ out_reval: cifs_inode = CIFS_I(inode); cifs_inode->time = 0; /* will force revalidate to get info when needed */ - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -2060,8 +2130,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) */ cifsInode->time = 0; - d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = - current_time(inode); + inode_set_ctime_current(d_inode(direntry)); + inode->i_mtime = inode_set_ctime_current(inode); rmdir_exit: free_dentry_path(page); @@ -2267,8 +2337,8 @@ unlink_target: /* force revalidate to go get info when needed */ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; - source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = - target_dir->i_mtime = current_time(source_dir); + source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir, + inode_set_ctime_current(target_dir)); cifs_rename_exit: kfree(info_buf_source); @@ -2540,7 +2610,7 @@ int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, return rc; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blksize = cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2610,7 +2680,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, } cifsFileInfo_put(cfile); - return -ENOTSUPP; + return -EOPNOTSUPP; } int cifs_truncate_page(struct address_space *mapping, loff_t from) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index d7e85d9a2655..35b176457bbe 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free) } struct cifs_tcon * -tconInfoAlloc(void) +tcon_info_alloc(bool dir_leases_enabled) { struct cifs_tcon *ret_buf; ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfids = init_cached_dirs(); - if (!ret_buf->cfids) { - kfree(ret_buf); - return NULL; + + if (dir_leases_enabled == true) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { + kfree(ret_buf); + return NULL; + } } + /* else ret_buf->cfids is already set to NULL above */ atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; @@ -476,7 +480,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) return false; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv; + pserver = SERVER_IS_CHAN(srv) ? srv->primary_server : srv; /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/namespace.c index b1c2499b1c3b..c8f5ed8a69f1 100644 --- a/fs/smb/client/cifs_dfs_ref.c +++ b/fs/smb/client/namespace.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Contains the CIFS DFS referral mounting routines used for handling - * traversal via DFS junction point + * Contains mounting routines used for handling traversal via SMB junctions. * * Copyright (c) 2007 Igor Mammedov * Copyright (C) International Business Machines Corp., 2008 * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) + * Copyright (c) 2023 Paulo Alcantara <palcantara@suse.de> */ #include <linux/dcache.h> @@ -19,32 +19,31 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifsfs.h" -#include "dns_resolve.h" #include "cifs_debug.h" -#include "dfs.h" #include "fs_context.h" -static LIST_HEAD(cifs_dfs_automount_list); +static LIST_HEAD(cifs_automount_list); -static void cifs_dfs_expire_automounts(struct work_struct *work); -static DECLARE_DELAYED_WORK(cifs_dfs_automount_task, - cifs_dfs_expire_automounts); -static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ; +static void cifs_expire_automounts(struct work_struct *work); +static DECLARE_DELAYED_WORK(cifs_automount_task, + cifs_expire_automounts); +static int cifs_mountpoint_expiry_timeout = 500 * HZ; -static void cifs_dfs_expire_automounts(struct work_struct *work) +static void cifs_expire_automounts(struct work_struct *work) { - struct list_head *list = &cifs_dfs_automount_list; + struct list_head *list = &cifs_automount_list; mark_mounts_for_expiry(list); if (!list_empty(list)) - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); + schedule_delayed_work(&cifs_automount_task, + cifs_mountpoint_expiry_timeout); } -void cifs_dfs_release_automount_timer(void) +void cifs_release_automount_timer(void) { - BUG_ON(!list_empty(&cifs_dfs_automount_list)); - cancel_delayed_work_sync(&cifs_dfs_automount_task); + if (WARN_ON(!list_empty(&cifs_automount_list))) + return; + cancel_delayed_work_sync(&cifs_automount_task); } /** @@ -118,26 +117,53 @@ cifs_build_devname(char *nodename, const char *prepath) return dev; } -static int set_dest_addr(struct smb3_fs_context *ctx) +/* Return full path out of a dentry set for automount */ +static char *automount_fullpath(struct dentry *dentry, void *page) { - struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; - int rc; + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + size_t len; + char *s; + + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + spin_unlock(&tcon->tc_lock); + return build_path_from_dentry_optional_prefix(dentry, + page, + true); + } + spin_unlock(&tcon->tc_lock); + + s = dentry_path_raw(dentry, page, PATH_MAX); + if (IS_ERR(s)) + return s; + /* for root, we want "" */ + if (!s[1]) + s++; + + spin_lock(&tcon->tc_lock); + len = strlen(tcon->origin_fullpath); + if (s < (char *)page + len) { + spin_unlock(&tcon->tc_lock); + return ERR_PTR(-ENAMETOOLONG); + } + + s -= len; + memcpy(s, tcon->origin_fullpath, len); + spin_unlock(&tcon->tc_lock); + convert_delimiter(s, '/'); - rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL); - if (!rc) - cifs_set_port(addr, ctx->port); - return rc; + return s; } /* * Create a vfsmount that we can automount */ -static struct vfsmount *cifs_dfs_do_automount(struct path *path) +static struct vfsmount *cifs_do_automount(struct path *path) { int rc; struct dentry *mntpt = path->dentry; struct fs_context *fc; - struct cifs_sb_info *cifs_sb; void *page = NULL; struct smb3_fs_context *ctx, *cur_ctx; struct smb3_fs_context tmp; @@ -147,17 +173,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) if (IS_ROOT(mntpt)) return ERR_PTR(-ESTALE); - /* - * The MSDFS spec states that paths in DFS referral requests and - * responses must be prefixed by a single '\' character instead of - * the double backslashes usually used in the UNC. This function - * gives us the latter, so we must adjust the result. - */ - cifs_sb = CIFS_SB(mntpt->d_sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) - return ERR_PTR(-EREMOTE); - - cur_ctx = cifs_sb->ctx; + cur_ctx = CIFS_SB(mntpt->d_sb)->ctx; fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt); if (IS_ERR(fc)) @@ -166,7 +182,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) ctx = smb3_fc2context(fc); page = alloc_dentry_path(); - full_path = dfs_get_automount_devname(mntpt, page); + full_path = automount_fullpath(mntpt, page); if (IS_ERR(full_path)) { mnt = ERR_CAST(full_path); goto out; @@ -196,15 +212,10 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) ctx->source = NULL; goto out; } - cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dstaddr=%pISpc\n", - __func__, ctx->source, ctx->UNC, ctx->prepath, &ctx->dstaddr); - - rc = set_dest_addr(ctx); - if (!rc) - mnt = fc_mount(fc); - else - mnt = ERR_PTR(rc); + cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s\n", + __func__, ctx->source, ctx->UNC, ctx->prepath); + mnt = fc_mount(fc); out: put_fs_context(fc); free_dentry_path(page); @@ -214,25 +225,25 @@ out: /* * Attempt to automount the referral */ -struct vfsmount *cifs_dfs_d_automount(struct path *path) +struct vfsmount *cifs_d_automount(struct path *path) { struct vfsmount *newmnt; cifs_dbg(FYI, "%s: %pd\n", __func__, path->dentry); - newmnt = cifs_dfs_do_automount(path); + newmnt = cifs_do_automount(path); if (IS_ERR(newmnt)) { cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__); return newmnt; } mntget(newmnt); /* prevent immediate expiration */ - mnt_set_expiry(newmnt, &cifs_dfs_automount_list); - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); + mnt_set_expiry(newmnt, &cifs_automount_list); + schedule_delayed_work(&cifs_automount_task, + cifs_mountpoint_expiry_timeout); cifs_dbg(FYI, "leaving %s [ok]\n" , __func__); return newmnt; } -const struct inode_operations cifs_dfs_referral_inode_operations = { +const struct inode_operations cifs_namespace_inode_operations = { }; diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index ef638086d734..47fc22de8d20 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -143,6 +143,7 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) case IO_REPARSE_TAG_DFSR: case IO_REPARSE_TAG_SYMLINK: case IO_REPARSE_TAG_NFS: + case IO_REPARSE_TAG_MOUNT_POINT: case 0: return true; } @@ -163,29 +164,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * TODO: go through all documented reparse tags to see if we can * reasonably map some of them to directories vs. files vs. symlinks */ + if ((fattr->cf_cifsattrs & ATTR_REPARSE) && + cifs_reparse_point_to_fattr(cifs_sb, fattr, fattr->cf_cifstag)) + goto out_reparse; + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->ctx->dir_mode; fattr->cf_dtype = DT_DIR; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) { - fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_LNK; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) { - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_FIFO; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) { - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_SOCK; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) { - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_CHR; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) { - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_BLK; - } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */ + } else { fattr->cf_mode = S_IFREG | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_REG; } +out_reparse: /* * We need to revalidate it further to make a decision about whether it * is a symbolic link, DFS referral or a reparse point with a direct diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index c57ca2050b73..79f26c560edf 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -323,12 +323,12 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) ses->chans[chan_index].iface = iface; /* No iface is found. if secondary chan, drop connection */ - if (!iface && CIFS_SERVER_IS_CHAN(server)) + if (!iface && SERVER_IS_CHAN(server)) ses->chans[chan_index].server = NULL; spin_unlock(&ses->chan_lock); - if (!iface && CIFS_SERVER_IS_CHAN(server)) + if (!iface && SERVER_IS_CHAN(server)) cifs_put_tcp_session(server, false); return rc; @@ -360,11 +360,11 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, { struct TCP_Server_Info *chan_server; struct cifs_chan *chan; - struct smb3_fs_context ctx = {NULL}; + struct smb3_fs_context *ctx; static const char unc_fmt[] = "\\%s\\foo"; - char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0}; struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + size_t len; int rc; unsigned int xid = get_xid(); @@ -388,54 +388,64 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, * the session and server without caring about memory * management. */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + rc = -ENOMEM; + goto out_free_xid; + } /* Always make new connection for now (TODO?) */ - ctx.nosharesock = true; + ctx->nosharesock = true; /* Auth */ - ctx.domainauto = ses->domainAuto; - ctx.domainname = ses->domainName; + ctx->domainauto = ses->domainAuto; + ctx->domainname = ses->domainName; /* no hostname for extra channels */ - ctx.server_hostname = ""; + ctx->server_hostname = ""; - ctx.username = ses->user_name; - ctx.password = ses->password; - ctx.sectype = ses->sectype; - ctx.sign = ses->sign; + ctx->username = ses->user_name; + ctx->password = ses->password; + ctx->sectype = ses->sectype; + ctx->sign = ses->sign; /* UNC and paths */ /* XXX: Use ses->server->hostname? */ - sprintf(unc, unc_fmt, ses->ip_addr); - ctx.UNC = unc; - ctx.prepath = ""; + len = sizeof(unc_fmt) + SERVER_NAME_LEN_WITH_NULL; + ctx->UNC = kzalloc(len, GFP_KERNEL); + if (!ctx->UNC) { + rc = -ENOMEM; + goto out_free_ctx; + } + scnprintf(ctx->UNC, len, unc_fmt, ses->ip_addr); + ctx->prepath = ""; /* Reuse same version as master connection */ - ctx.vals = ses->server->vals; - ctx.ops = ses->server->ops; + ctx->vals = ses->server->vals; + ctx->ops = ses->server->ops; - ctx.noblocksnd = ses->server->noblocksnd; - ctx.noautotune = ses->server->noautotune; - ctx.sockopt_tcp_nodelay = ses->server->tcp_nodelay; - ctx.echo_interval = ses->server->echo_interval / HZ; - ctx.max_credits = ses->server->max_credits; + ctx->noblocksnd = ses->server->noblocksnd; + ctx->noautotune = ses->server->noautotune; + ctx->sockopt_tcp_nodelay = ses->server->tcp_nodelay; + ctx->echo_interval = ses->server->echo_interval / HZ; + ctx->max_credits = ses->server->max_credits; /* * This will be used for encoding/decoding user/domain/pw * during sess setup auth. */ - ctx.local_nls = cifs_sb->local_nls; + ctx->local_nls = cifs_sb->local_nls; /* Use RDMA if possible */ - ctx.rdma = iface->rdma_capable; - memcpy(&ctx.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage)); + ctx->rdma = iface->rdma_capable; + memcpy(&ctx->dstaddr, &iface->sockaddr, sizeof(ctx->dstaddr)); /* reuse master con client guid */ - memcpy(&ctx.client_guid, ses->server->client_guid, - SMB2_CLIENT_GUID_SIZE); - ctx.use_client_guid = true; + memcpy(&ctx->client_guid, ses->server->client_guid, + sizeof(ctx->client_guid)); + ctx->use_client_guid = true; - chan_server = cifs_get_tcp_session(&ctx, ses->server); + chan_server = cifs_get_tcp_session(ctx, ses->server); spin_lock(&ses->chan_lock); chan = &ses->chans[ses->chan_count]; @@ -497,6 +507,10 @@ out: cifs_put_tcp_session(chan->server, 0); } + kfree(ctx->UNC); +out_free_ctx: + kfree(ctx); +out_free_xid: free_xid(xid); return rc; } diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 7d1b3fc014d9..9bf8735cdd1e 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -542,14 +542,17 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink) +static int cifs_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data) { int rc; FILE_ALL_INFO fi = {}; - *symlink = false; + data->symlink = false; + data->adjust_tz = false; /* could do find first instead but this returns more info */ rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, @@ -562,7 +565,7 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); - *adjustTZ = true; + data->adjust_tz = true; } if (!rc) { @@ -589,7 +592,7 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Need to check if this is a symbolic link or not */ tmprc = CIFS_open(xid, &oparms, &oplock, NULL); if (tmprc == -EOPNOTSUPP) - *symlink = true; + data->symlink = true; else if (tmprc == 0) CIFSSMBClose(xid, tcon, fid.netfid); } @@ -969,13 +972,16 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, #endif } -static int -cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - char **target_path, bool is_reparse_point) +static int cifs_query_symlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + char **target_path, + struct kvec *rsp_iov) { int rc; int oplock = 0; + bool is_reparse_point = !!rsp_iov; struct cifs_fid fid; struct cifs_open_parms oparms; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 8e696fbd72fa..0b89f7008ac0 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -35,34 +35,22 @@ free_set_inf_compound(struct smb_rqst *rqst) SMB2_close_free(&rqst[2]); } - -struct cop_vars { - struct cifs_open_parms oparms; - struct kvec rsp_iov[3]; - struct smb_rqst rqst[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; - struct kvec close_iov[1]; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; -}; - /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. * - * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all - * error responses. Caller is also responsible for freeing them up. + * If passing @out_iov and @out_buftype, ensure to make them both large enough + * (>= 3) to hold all compounded responses. Caller is also responsible for + * freeing them up with free_rsp_buf(). */ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, __u8 **extbuf, size_t *extbuflen, - struct kvec *err_iov, int *err_buftype) + struct kvec *out_iov, int *out_buftype) { - struct cop_vars *vars = NULL; + struct smb2_compound_vars *vars = NULL; struct kvec *rsp_iov; struct smb_rqst *rqst; int rc; @@ -133,7 +121,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, /* Operation */ switch (command) { case SMB2_OP_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov[0]; + rqst[num_rqst].rq_iov = &vars->qi_iov; rqst[num_rqst].rq_nvec = 1; if (cfile) @@ -167,7 +155,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, full_path); break; case SMB2_OP_POSIX_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov[0]; + rqst[num_rqst].rq_iov = &vars->qi_iov; rqst[num_rqst].rq_nvec = 1; if (cfile) @@ -375,7 +363,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto after_close; /* Close */ flags |= CIFS_CP_CREATE_CLOSE_OP; - rqst[num_rqst].rq_iov = &vars->close_iov[0]; + rqst[num_rqst].rq_iov = &vars->close_iov; rqst[num_rqst].rq_nvec = 1; rc = SMB2_close_init(tcon, server, &rqst[num_rqst], COMPOUND_FID, @@ -529,9 +517,9 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (cfile) cifsFileInfo_put(cfile); - if (rc && err_iov && err_buftype) { - memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); - memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); + if (out_iov && out_buftype) { + memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov)); + memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype)); } else { free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); @@ -541,20 +529,53 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) +static int parse_create_response(struct cifs_open_info_data *data, + struct cifs_sb_info *cifs_sb, + const struct kvec *iov) +{ + struct smb2_create_rsp *rsp = iov->iov_base; + bool reparse_point = false; + u32 tag = 0; + int rc = 0; + + switch (rsp->hdr.Status) { + case STATUS_IO_REPARSE_TAG_NOT_HANDLED: + reparse_point = true; + break; + case STATUS_STOPPED_ON_SYMLINK: + rc = smb2_parse_symlink_response(cifs_sb, iov, + &data->symlink_target); + if (rc) + return rc; + tag = IO_REPARSE_TAG_SYMLINK; + reparse_point = true; + break; + case STATUS_SUCCESS: + reparse_point = !!(rsp->Flags & SMB2_CREATE_FLAG_REPARSEPOINT); + break; + } + data->reparse_point = reparse_point; + data->reparse_tag = tag; + return rc; +} + +int smb2_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data) { __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; - struct kvec err_iov[3] = {}; - int err_buftype[3] = {}; + struct smb2_hdr *hdr; + struct kvec out_iov[3] = {}; + int out_buftype[3] = {}; bool islink; int rc, rc2; - *adjust_tz = false; - *reparse = false; + data->adjust_tz = false; + data->reparse_point = false; if (strcmp(full_path, "")) rc = -ENOENT; @@ -575,69 +596,73 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, - NULL, NULL, err_iov, err_buftype); - if (rc) { - struct smb2_hdr *hdr = err_iov[0].iov_base; - - if (unlikely(!hdr || err_buftype[0] == CIFS_NO_BUFFER)) + NULL, NULL, out_iov, out_buftype); + hdr = out_iov[0].iov_base; + /* + * If first iov is unset, then SMB session was dropped or we've got a + * cached open file (@cfile). + */ + if (!hdr || out_buftype[0] == CIFS_NO_BUFFER) + goto out; + + switch (rc) { + case 0: + case -EOPNOTSUPP: + rc = parse_create_response(data, cifs_sb, &out_iov[0]); + if (rc || !data->reparse_point) goto out; - if (rc == -EOPNOTSUPP && hdr->Command == SMB2_CREATE && - hdr->Status == STATUS_STOPPED_ON_SYMLINK) { - rc = smb2_parse_symlink_response(cifs_sb, err_iov, - &data->symlink_target); - if (rc) - goto out; - - *reparse = true; - create_options |= OPEN_REPARSE_POINT; - - /* Failed on a symbolic link - query a reparse point info */ - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL, - NULL, NULL); + + create_options |= OPEN_REPARSE_POINT; + /* Failed on a symbolic link - query a reparse point info */ + cifs_get_readable_path(tcon, full_path, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, + SMB2_OP_QUERY_INFO, cfile, NULL, NULL, + NULL, NULL); + break; + case -EREMOTE: + break; + default: + if (hdr->Status != STATUS_OBJECT_NAME_INVALID) + break; + rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb, + full_path, &islink); + if (rc2) { + rc = rc2; goto out; - } else if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) { - rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb, - full_path, &islink); - if (rc2) { - rc = rc2; - goto out; - } - if (islink) - rc = -EREMOTE; } + if (islink) + rc = -EREMOTE; } out: - free_rsp_buf(err_buftype[0], err_iov[0].iov_base); - free_rsp_buf(err_buftype[1], err_iov[1].iov_base); - free_rsp_buf(err_buftype[2], err_iov[2].iov_base); + free_rsp_buf(out_buftype[0], out_iov[0].iov_base); + free_rsp_buf(out_buftype[1], out_iov[1].iov_base); + free_rsp_buf(out_buftype[2], out_iov[2].iov_base); return rc; } - -int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, +int smb311_posix_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, struct cifs_open_info_data *data, struct cifs_sid *owner, - struct cifs_sid *group, - bool *adjust_tz, bool *reparse) + struct cifs_sid *group) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct kvec err_iov[3] = {}; - int err_buftype[3] = {}; + struct kvec out_iov[3] = {}; + int out_buftype[3] = {}; __u8 *sidsbuf = NULL; __u8 *sidsbuf_end = NULL; size_t sidsbuflen = 0; size_t owner_len, group_len; - *adjust_tz = false; - *reparse = false; + data->adjust_tz = false; + data->reparse_point = false; /* * BB TODO: Add support for using the cached root handle. @@ -649,27 +674,33 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - &sidsbuf, &sidsbuflen, err_iov, err_buftype); - if (rc == -EOPNOTSUPP) { + &sidsbuf, &sidsbuflen, out_iov, out_buftype); + /* + * If first iov is unset, then SMB session was dropped or we've got a + * cached open file (@cfile). + */ + if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER) + goto out; + + switch (rc) { + case 0: + case -EOPNOTSUPP: /* BB TODO: When support for special files added to Samba re-verify this path */ - if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && - ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && - ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { - rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); - if (rc) - goto out; - } - *reparse = true; - create_options |= OPEN_REPARSE_POINT; + rc = parse_create_response(data, cifs_sb, &out_iov[0]); + if (rc || !data->reparse_point) + goto out; + create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, &sidsbuf, &sidsbuflen, NULL, NULL); + break; } +out: if (rc == 0) { sidsbuf_end = sidsbuf + sidsbuflen; @@ -689,11 +720,10 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, memcpy(group, sidsbuf + owner_len, group_len); } -out: kfree(sidsbuf); - free_rsp_buf(err_buftype[0], err_iov[0].iov_base); - free_rsp_buf(err_buftype[1], err_iov[1].iov_base); - free_rsp_buf(err_buftype[2], err_iov[2].iov_base); + free_rsp_buf(out_buftype[0], out_iov[0].iov_base); + free_rsp_buf(out_buftype[1], out_iov[1].iov_base); + free_rsp_buf(out_buftype[2], out_iov[2].iov_base); return rc; } diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 194799ddd382..1a90dd78b238 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = { "STATUS_IO_REPARSE_TAG_MISMATCH"}, {STATUS_IO_REPARSE_DATA_INVALID, -EIO, "STATUS_IO_REPARSE_DATA_INVALID"}, - {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO, - "STATUS_IO_REPARSE_TAG_NOT_HANDLED"}, {STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO, "STATUS_REPARSE_POINT_NOT_RESOLVED"}, {STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO, diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 3935a60db5c3..25f7cd6f23d6 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -145,7 +145,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) __u64 mid; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* * Add function to do table lookup of StructureSize by command @@ -623,7 +623,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for lease break\n"); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -698,7 +698,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0f62bc373ad0..9aeecee6b91b 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -172,8 +172,17 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val) spin_lock(&server->req_lock); server->credits = val; - if (val == 1) + if (val == 1) { server->reconnect_instance++; + /* + * ChannelSequence updated for all channels in primary channel so that consistent + * across SMB3 requests sent on any channel. See MS-SMB2 3.2.4.1 and 3.2.7.1 + */ + if (SERVER_IS_CHAN(server)) + server->primary_server->channel_sequence_num++; + else + server->channel_sequence_num++; + } scredits = server->credits; in_flight = server->in_flight; spin_unlock(&server->req_lock); @@ -288,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); - return -ENOTSUPP; + return -EOPNOTSUPP; } spin_lock(&server->req_lock); @@ -1075,31 +1084,28 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - static int smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, const char *path, const char *ea_name, const void *ea_value, const __u16 ea_value_len, const struct nls_table *nls_codepage, struct cifs_sb_info *cifs_sb) { + struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct smb_rqst *rqst; + struct kvec *rsp_iov; __le16 *utf16_path = NULL; int ea_name_len = strlen(ea_name); int flags = CIFS_CP_CREATE_CLOSE_OP; int len; - struct smb_rqst rqst[3]; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct cifs_open_parms oparms; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; unsigned int size[1]; void *data[1]; struct smb2_file_full_ea_info *ea = NULL; - struct kvec close_iov[1]; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; @@ -1113,9 +1119,14 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); + vars = kzalloc(sizeof(*vars), GFP_KERNEL); + if (!vars) { + rc = -ENOMEM; + goto out_free_path; + } + rqst = vars->rqst; + rsp_iov = vars->rsp_iov; if (ses->server->ops->query_all_EAs) { if (!ea_value) { @@ -1150,7 +1161,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, /* Use a fudge factor of 256 bytes in case we collide * with a different set_EAs command. */ - if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 < used_len + ea_name_len + ea_value_len + 1) { rc = -ENOSPC; @@ -1160,8 +1171,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, } /* Open */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = vars->open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms = (struct cifs_open_parms) { @@ -1181,8 +1191,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, /* Set Info */ - memset(&si_iov, 0, sizeof(si_iov)); - rqst[1].rq_iov = si_iov; + rqst[1].rq_iov = vars->si_iov; rqst[1].rq_nvec = 1; len = sizeof(*ea) + ea_name_len + ea_value_len + 1; @@ -1210,10 +1219,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); - /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); @@ -1228,13 +1235,15 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, sea_exit: kfree(ea); - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_set_info_free(&rqst[1]); SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); +out_free_path: + kfree(utf16_path); return rc; } #endif @@ -1396,7 +1405,8 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, if (file_inf.LastWriteTime) inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime); if (file_inf.ChangeTime) - inode->i_ctime = cifs_NTtimeToUnix(file_inf.ChangeTime); + inode_set_ctime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.ChangeTime)); if (file_inf.LastAccessTime) inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime); @@ -1445,16 +1455,6 @@ req_res_key_exit: return rc; } -struct iqi_vars { - struct smb_rqst rqst[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; - struct kvec close_iov[1]; -}; - static int smb2_ioctl_query_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -1462,7 +1462,7 @@ smb2_ioctl_query_info(const unsigned int xid, __le16 *path, int is_dir, unsigned long p) { - struct iqi_vars *vars; + struct smb2_compound_vars *vars; struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; @@ -1580,7 +1580,7 @@ smb2_ioctl_query_info(const unsigned int xid, rc = -EINVAL; goto free_open_req; } - rqst[1].rq_iov = &vars->si_iov[0]; + rqst[1].rq_iov = vars->si_iov; rqst[1].rq_nvec = 1; /* MS-FSCC 2.4.13 FileEndOfFileInformation */ @@ -1592,7 +1592,7 @@ smb2_ioctl_query_info(const unsigned int xid, SMB2_O_INFO_FILE, 0, data, size); free_req1_func = SMB2_set_info_free; } else if (qi.flags == PASSTHRU_QUERY_INFO) { - rqst[1].rq_iov = &vars->qi_iov[0]; + rqst[1].rq_iov = &vars->qi_iov; rqst[1].rq_nvec = 1; rc = SMB2_query_info_init(tcon, server, @@ -1614,7 +1614,7 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_related(&rqst[1]); /* Close */ - rqst[2].rq_iov = &vars->close_iov[0]; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, @@ -2407,7 +2407,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) return false; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { @@ -2523,15 +2523,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb) { + struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst rqst[3]; + struct smb_rqst *rqst; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec close_iov[1]; + struct kvec *rsp_iov; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; @@ -2548,9 +2546,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); + vars = kzalloc(sizeof(*vars), GFP_KERNEL); + if (!vars) { + rc = -ENOMEM; + goto out_free_path; + } + rqst = vars->rqst; + rsp_iov = vars->rsp_iov; /* * We can only call this for things we know are directories. @@ -2559,8 +2562,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, open_cached_dir(xid, tcon, path, cifs_sb, false, &cfid); /* cfid null if open dir failed */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = vars->open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms = (struct cifs_open_parms) { @@ -2578,8 +2580,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_next_command(tcon, &rqst[0]); - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; + rqst[1].rq_iov = &vars->qi_iov; rqst[1].rq_nvec = 1; if (cfid) { @@ -2606,8 +2607,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); } - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, @@ -2638,7 +2638,6 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, *buftype = resp_buftype[1]; qic_exit: - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); SMB2_close_free(&rqst[2]); @@ -2646,6 +2645,9 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); if (cfid) close_cached_dir(cfid); + kfree(vars); +out_free_path: + kfree(utf16_path); return rc; } @@ -2681,6 +2683,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, smb2_copy_fs_info_to_kstatfs(info, buf); qfs_exit: + trace_smb3_qfs_done(xid, tcon->tid, tcon->ses->Suid, tcon->tree_name, rc); free_rsp_buf(buftype, rsp_iov.iov_base); return rc; } @@ -2948,154 +2951,32 @@ parse_reparse_point(struct reparse_data_buffer *buf, } } -static int -smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - char **target_path, bool is_reparse_point) +static int smb2_query_symlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + char **target_path, + struct kvec *rsp_iov) { - int rc; - __le16 *utf16_path = NULL; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; - struct cifs_fid fid; - struct kvec err_iov = {NULL, 0}; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst rqst[3]; - int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec close_iov[1]; - struct smb2_create_rsp *create_rsp; - struct smb2_ioctl_rsp *ioctl_rsp; - struct reparse_data_buffer *reparse_buf; - int create_options = is_reparse_point ? OPEN_REPARSE_POINT : 0; - u32 plen; + struct reparse_data_buffer *buf; + struct smb2_ioctl_rsp *io = rsp_iov->iov_base; + u32 plen = le32_to_cpu(io->OutputCount); cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - *target_path = NULL; - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - memset(rqst, 0, sizeof(rqst)); - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); - - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - /* Open */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; - rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .path = full_path, - .desired_access = FILE_READ_ATTRIBUTES, - .disposition = FILE_OPEN, - .create_options = cifs_create_options(cifs_sb, create_options), - .fid = &fid, - }; - - rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, utf16_path); - if (rc) - goto querty_exit; - smb2_set_next_command(tcon, &rqst[0]); - - - /* IOCTL */ - memset(&io_iov, 0, sizeof(io_iov)); - rqst[1].rq_iov = io_iov; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], fid.persistent_fid, - fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); - if (rc) - goto querty_exit; - - smb2_set_next_command(tcon, &rqst[1]); - smb2_set_related(&rqst[1]); - - - /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; - rqst[2].rq_nvec = 1; - - rc = SMB2_close_init(tcon, server, - &rqst[2], COMPOUND_FID, COMPOUND_FID, false); - if (rc) - goto querty_exit; - - smb2_set_related(&rqst[2]); - - rc = compound_send_recv(xid, tcon->ses, server, - flags, 3, rqst, - resp_buftype, rsp_iov); - - create_rsp = rsp_iov[0].iov_base; - if (create_rsp && create_rsp->hdr.Status) - err_iov = rsp_iov[0]; - ioctl_rsp = rsp_iov[1].iov_base; - - /* - * Open was successful and we got an ioctl response. - */ - if ((rc == 0) && (is_reparse_point)) { - /* See MS-FSCC 2.3.23 */ - - reparse_buf = (struct reparse_data_buffer *) - ((char *)ioctl_rsp + - le32_to_cpu(ioctl_rsp->OutputOffset)); - plen = le32_to_cpu(ioctl_rsp->OutputCount); - - if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) > - rsp_iov[1].iov_len) { - cifs_tcon_dbg(VFS, "srv returned invalid ioctl len: %d\n", - plen); - rc = -EIO; - goto querty_exit; - } - - rc = parse_reparse_point(reparse_buf, plen, target_path, - cifs_sb); - goto querty_exit; - } - - if (!rc || !err_iov.iov_base) { - rc = -ENOENT; - goto querty_exit; - } - - rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path); - - querty_exit: - cifs_dbg(FYI, "query symlink rc %d\n", rc); - kfree(utf16_path); - SMB2_open_free(&rqst[0]); - SMB2_ioctl_free(&rqst[1]); - SMB2_close_free(&rqst[2]); - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - return rc; + buf = (struct reparse_data_buffer *)((u8 *)io + + le32_to_cpu(io->OutputOffset)); + return parse_reparse_point(buf, plen, target_path, cifs_sb); } -int -smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 *tag) +static int smb2_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype) { + struct smb2_compound_vars *vars; int rc; __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; @@ -3103,12 +2984,9 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst rqst[3]; + struct smb_rqst *rqst; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec close_iov[1]; + struct kvec *rsp_iov; struct smb2_ioctl_rsp *ioctl_rsp; struct reparse_data_buffer *reparse_buf; u32 plen; @@ -3118,20 +2996,24 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - memset(rqst, 0, sizeof(rqst)); - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); if (!utf16_path) return -ENOMEM; + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + vars = kzalloc(sizeof(*vars), GFP_KERNEL); + if (!vars) { + rc = -ENOMEM; + goto out_free_path; + } + rqst = vars->rqst; + rsp_iov = vars->rsp_iov; + /* * setup smb2open - TODO add optimization to call cifs_get_readable_path * to see if there is a handle already open that we can use */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = vars->open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms = (struct cifs_open_parms) { @@ -3151,8 +3033,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, /* IOCTL */ - memset(&io_iov, 0, sizeof(io_iov)); - rqst[1].rq_iov = io_iov; + rqst[1].rq_iov = vars->io_iov; rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; rc = SMB2_ioctl_init(tcon, server, @@ -3167,10 +3048,8 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); - /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, @@ -3205,16 +3084,21 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, goto query_rp_exit; } *tag = le32_to_cpu(reparse_buf->ReparseTag); + *rsp = rsp_iov[1]; + *rsp_buftype = resp_buftype[1]; + resp_buftype[1] = CIFS_NO_BUFFER; } query_rp_exit: - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_ioctl_free(&rqst[1]); SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); +out_free_path: + kfree(utf16_path); return rc; } @@ -4400,7 +4284,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) u8 *ses_enc_key; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { @@ -4707,7 +4591,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (shdr->Command != SMB2_READ) { cifs_server_dbg(VFS, "only big read responses are supported\n"); - return -ENOTSUPP; + return -EOPNOTSUPP; } if (server->ops->is_session_expired && @@ -5298,6 +5182,7 @@ struct smb_version_operations smb20_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5399,6 +5284,7 @@ struct smb_version_operations smb21_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5503,7 +5389,7 @@ struct smb_version_operations smb30_operations = { .echo = SMB2_echo, .query_path_info = smb2_query_path_info, /* WSL tags introduced long after smb2.1, enable for SMB3, 3.11 only */ - .query_reparse_tag = smb2_query_reparse_tag, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5616,7 +5502,7 @@ struct smb_version_operations smb311_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, - .query_reparse_tag = smb2_query_reparse_tag, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index a457f07f820d..c75a80bb6d9e 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -88,10 +88,27 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, const struct cifs_tcon *tcon, struct TCP_Server_Info *server) { + struct smb3_hdr_req *smb3_hdr; + shdr->ProtocolId = SMB2_PROTO_NUMBER; shdr->StructureSize = cpu_to_le16(64); shdr->Command = smb2_cmd; + if (server) { + /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */ + if (server->dialect >= SMB30_PROT_ID) { + smb3_hdr = (struct smb3_hdr_req *)shdr; + /* + * if primary channel is not set yet, use default + * channel for chan sequence num + */ + if (SERVER_IS_CHAN(server)) + smb3_hdr->ChannelSequence = + cpu_to_le16(server->primary_server->channel_sequence_num); + else + smb3_hdr->ChannelSequence = + cpu_to_le16(server->channel_sequence_num); + } spin_lock(&server->req_lock); /* Request up to 10 credits but don't go over the limit. */ if (server->credits >= server->max_credits) @@ -553,7 +570,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * secondary channels don't have the hostname field populated * use the hostname field in the primary channel instead */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; cifs_server_lock(pserver); hostname = pserver->hostname; if (hostname && (hostname[0] != 0)) { @@ -831,7 +848,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) iov[num].iov_base = create_posix_buf(mode); if (mode == ACL_NO_MODE) - cifs_dbg(FYI, "Invalid mode\n"); + cifs_dbg(FYI, "%s: no mode\n", __func__); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2223,7 +2240,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) * (most servers default to 120 seconds) and most clients default to 0. * This can be overridden at mount ("handletimeout=") if the user wants * a different persistent (or resilient) handle timeout for all opens - * opens on a particular SMB3 mount. + * on a particular SMB3 mount. */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); @@ -2368,7 +2385,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) return 0; } -/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ +/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ static void setup_owner_group_sids(char *buf) { struct owner_group_sids *sids = (struct owner_group_sids *)buf; @@ -2570,8 +2587,8 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, /* Do not append the separator if the path is empty */ if (path[0] != cpu_to_le16(0x0000)) { - UniStrcat(*out_path, sep); - UniStrcat(*out_path, path); + UniStrcat((wchar_t *)*out_path, (wchar_t *)sep); + UniStrcat((wchar_t *)*out_path, (wchar_t *)path); } unload_nls(cp); @@ -3113,6 +3130,7 @@ void SMB2_ioctl_free(struct smb_rqst *rqst) { int i; + if (rqst && rqst->rq_iov) { cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ for (i = 1; i < rqst->rq_nvec; i++) @@ -3785,7 +3803,7 @@ void smb2_reconnect_server(struct work_struct *work) bool resched = false; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ mutex_lock(&pserver->reconnect_mutex); @@ -3860,7 +3878,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = tconInfoAlloc(); + tcon = tcon_info_alloc(false); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index d5d7ffb7711c..46eff9ec302a 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -56,9 +56,11 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); +int smb2_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -275,12 +277,13 @@ extern int smb2_query_info_compound(const unsigned int xid, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); /* query path info from the server using SMB311 POSIX extensions*/ -int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, +int smb311_posix_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, struct cifs_open_info_data *data, struct cifs_sid *owner, - struct cifs_sid *group, - bool *adjust_tz, bool *reparse); + struct cifs_sid *group); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 7676091b3e77..23c50ed7d4b5 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -86,7 +86,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) spin_lock(&cifs_tcp_ses_lock); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid == ses_id) @@ -149,7 +149,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) struct cifs_ses *ses; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 2a2aec8c6112..94df9eec3d8d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1401,10 +1401,13 @@ create_conn: server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); - if (server->smbd_conn) + if (server->smbd_conn) { cifs_dbg(VFS, "RDMA transport re-established\n"); - - return server->smbd_conn ? 0 : -ENOENT; + trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr); + return 0; + } + trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr); + return -ENOENT; } static void destroy_caches_and_workqueue(struct smbd_connection *info) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index e671bd16f00c..de199ec9f726 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -691,7 +691,7 @@ DEFINE_EVENT(smb3_tcon_class, smb3_##name, \ TP_ARGS(xid, tid, sesid, unc_name, rc)) DEFINE_SMB3_TCON_EVENT(tcon); - +DEFINE_SMB3_TCON_EVENT(qfs_done); /* * For smb2/smb3 open (including create and mkdir) calls @@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \ TP_ARGS(hostname, conn_id, addr)) DEFINE_SMB3_CONNECT_EVENT(connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err); DECLARE_EVENT_CLASS(smb3_connect_err_class, TP_PROTO(char *hostname, __u64 conn_id, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index f280502a2aee..14710afdc2a3 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -18,7 +18,7 @@ #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uaccess.h> -#include <asm/processor.h> +#include <linux/processor.h> #include <linux/mempool.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> @@ -35,6 +35,8 @@ void cifs_wake_up_task(struct mid_q_entry *mid) { + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; wake_up_process(mid->callback_data); } @@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount) struct TCP_Server_Info *server = midEntry->server; if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) && - midEntry->mid_state == MID_RESPONSE_RECEIVED && + (midEntry->mid_state == MID_RESPONSE_RECEIVED || + midEntry->mid_state == MID_RESPONSE_READY) && server->ops->handle_cancelled_mid) server->ops->handle_cancelled_mid(midEntry, server); @@ -416,13 +419,19 @@ out: return rc; } +struct send_req_vars { + struct smb2_transform_hdr tr_hdr; + struct smb_rqst rqst[MAX_COMPOUND]; + struct kvec iov; +}; + static int smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst, int flags) { - struct kvec iov; - struct smb2_transform_hdr *tr_hdr; - struct smb_rqst cur_rqst[MAX_COMPOUND]; + struct send_req_vars *vars; + struct smb_rqst *cur_rqst; + struct kvec *iov; int rc; if (!(flags & CIFS_TRANSFORM_REQ)) @@ -436,16 +445,15 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS); - if (!tr_hdr) + vars = kzalloc(sizeof(*vars), GFP_NOFS); + if (!vars) return -ENOMEM; + cur_rqst = vars->rqst; + iov = &vars->iov; - memset(&cur_rqst[0], 0, sizeof(cur_rqst)); - memset(&iov, 0, sizeof(iov)); - - iov.iov_base = tr_hdr; - iov.iov_len = sizeof(*tr_hdr); - cur_rqst[0].rq_iov = &iov; + iov->iov_base = &vars->tr_hdr; + iov->iov_len = sizeof(vars->tr_hdr); + cur_rqst[0].rq_iov = iov; cur_rqst[0].rq_nvec = 1; rc = server->ops->init_transform_rq(server, num_rqst + 1, @@ -456,7 +464,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]); smb3_free_compound_rqst(num_rqst, &cur_rqst[1]); out: - kfree(tr_hdr); + kfree(vars); return rc; } @@ -732,7 +740,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) int error; error = wait_event_state(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED, + midQ->mid_state != MID_REQUEST_SUBMITTED && + midQ->mid_state != MID_RESPONSE_RECEIVED, (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; @@ -885,7 +894,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) spin_lock(&server->mid_lock); switch (mid->mid_state) { - case MID_RESPONSE_RECEIVED: + case MID_RESPONSE_READY: spin_unlock(&server->mid_lock); return rc; case MID_RETRY_NEEDED: @@ -984,6 +993,9 @@ cifs_compound_callback(struct mid_q_entry *mid) credits.instance = server->reconnect_instance; add_credits(server, &credits, mid->optype); + + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; } static void @@ -1204,7 +1216,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, send_cancel(server, &rqst[i], midQ[i]); spin_lock(&server->mid_lock); midQ[i]->mid_flags |= MID_WAIT_CANCELLED; - if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED || + midQ[i]->mid_state == MID_RESPONSE_RECEIVED) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; @@ -1225,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } if (!midQ[i]->resp_buf || - midQ[i]->mid_state != MID_RESPONSE_RECEIVED) { + midQ[i]->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_dbg(FYI, "Bad MID state?\n"); goto out; @@ -1412,7 +1425,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc != 0) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1429,7 +1443,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } if (!midQ->resp_buf || !out_buf || - midQ->mid_state != MID_RESPONSE_RECEIVED) { + midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_server_dbg(VFS, "Bad MID state?\n"); goto out; @@ -1553,14 +1567,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* Wait for a reply - allow signals to interrupt. */ rc = wait_event_interruptible(server->response_q, - (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || + (!(midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED)) || ((server->tcpStatus != CifsGood) && (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ spin_lock(&server->srv_lock); if ((rc == -ERESTARTSYS) && - (midQ->mid_state == MID_REQUEST_SUBMITTED) && + (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { spin_unlock(&server->srv_lock); @@ -1591,7 +1607,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1611,7 +1628,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return rc; /* rcvd frame is ok */ - if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { + if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_tcon_dbg(VFS, "Bad MID state?\n"); goto out; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index bae590eec871..319fb9ffc6a0 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -153,6 +153,28 @@ struct smb2_hdr { __u8 Signature[16]; } __packed; +struct smb3_hdr_req { + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le16 ChannelSequence; /* See MS-SMB2 3.2.4.1 and 3.2.7.1 */ + __le16 Reserved; + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + union { + struct { + __le32 ProcessId; + __le32 TreeId; + } __packed SyncId; + __le64 AsyncId; + } __packed Id; + __le64 SessionId; + __u8 Signature[16]; +} __packed; + struct smb2_pdu { struct smb2_hdr hdr; __le16 StructureSize2; /* size of wct area (varies, request specific) */ @@ -384,7 +406,7 @@ struct smb2_tree_disconnect_rsp { /* Capabilities flags */ #define SMB2_GLOBAL_CAP_DFS 0x00000001 #define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0x00000004 /* Resp only New to SMB2.1 */ #define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index 7055cb5d2880..cabe6a843c6a 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -1,10 +1,11 @@ config SMB_SERVER - tristate "SMB3 server support (EXPERIMENTAL)" + tristate "SMB3 server support" depends on INET depends on MULTIUSER depends on FILE_LOCKING select NLS select NLS_UTF8 + select NLS_UCS2_UTILS select CRYPTO select CRYPTO_MD5 select CRYPTO_HMAC diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index cc6384f79675..4a4b2b03ff33 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,12 +214,10 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; - conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; - memcpy(conn->mechToken, value, vlen); - conn->mechToken[vlen] = '\0'; return 0; } diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 5e5e120edcc2..229a6527870d 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -355,6 +355,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, if (blob_len < (u64)sess_key_off + sess_key_len) return -EINVAL; + if (sess_key_len > CIFS_KEY_SIZE) + return -EINVAL; + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); if (!ctx_arc4) return -ENOMEM; @@ -1029,11 +1032,15 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, { struct scatterlist *sg; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; - int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0; + int i, *nr_entries, total_entries = 0, sg_idx = 0; if (!nvec) return NULL; + nr_entries = kcalloc(nvec, sizeof(int), GFP_KERNEL); + if (!nr_entries) + return NULL; + for (i = 0; i < nvec - 1; i++) { unsigned long kaddr = (unsigned long)iov[i + 1].iov_base; @@ -1051,8 +1058,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, total_entries += 2; sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + if (!sg) { + kfree(nr_entries); return NULL; + } sg_init_table(sg, total_entries); smb2_sg_set_buf(&sg[sg_idx++], iov[0].iov_base + 24, assoc_data_len); @@ -1086,6 +1095,7 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, } } smb2_sg_set_buf(&sg[sg_idx], sign, SMB2_SIGNATURE_SIZE); + kfree(nr_entries); return sg; } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 2a717d158f02..0d990c2f33cd 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -123,28 +123,22 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) } } -int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) +void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - int ret = 1; if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) - return 0; + return; - if (!work->multiRsp) - atomic_dec(&conn->req_running); - if (!work->multiRsp) { - spin_lock(&conn->request_lock); - list_del_init(&work->request_entry); - spin_unlock(&conn->request_lock); - if (work->asynchronous) - release_async_work(work); - ret = 0; - } + atomic_dec(&conn->req_running); + spin_lock(&conn->request_lock); + list_del_init(&work->request_entry); + spin_unlock(&conn->request_lock); + if (work->asynchronous) + release_async_work(work); wake_up_all(&conn->req_running_q); - return ret; } void ksmbd_conn_lock(struct ksmbd_conn *conn) @@ -193,41 +187,22 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - size_t len = 0; int sent; - struct kvec iov[3]; - int iov_idx = 0; if (!work->response_buf) { pr_err("NULL response header\n"); return -EINVAL; } - if (work->tr_buf) { - iov[iov_idx] = (struct kvec) { work->tr_buf, - sizeof(struct smb2_transform_hdr) + 4 }; - len += iov[iov_idx++].iov_len; - } - - if (work->aux_payload_sz) { - iov[iov_idx] = (struct kvec) { work->response_buf, work->resp_hdr_sz }; - len += iov[iov_idx++].iov_len; - iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz }; - len += iov[iov_idx++].iov_len; - } else { - if (work->tr_buf) - iov[iov_idx].iov_len = work->resp_hdr_sz; - else - iov[iov_idx].iov_len = get_rfc1002_len(work->response_buf) + 4; - iov[iov_idx].iov_base = work->response_buf; - len += iov[iov_idx++].iov_len; - } + if (work->send_no_response) + return 0; ksmbd_conn_lock(conn); - sent = conn->transport->ops->writev(conn->transport, &iov[0], - iov_idx, len, - work->need_invalidate_rkey, - work->remote_key); + sent = conn->transport->ops->writev(conn->transport, work->iov, + work->iov_cnt, + get_rfc1002_len(work->iov[0].iov_base) + 4, + work->need_invalidate_rkey, + work->remote_key); ksmbd_conn_unlock(conn); if (sent < 0) { diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index ad8dfaa48ffb..ab2583f030ce 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -158,7 +158,7 @@ int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, struct smb2_buffer_desc_v1 *desc, unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); -int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); +void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); int ksmbd_conn_handler_loop(void *p); int ksmbd_conn_transport_init(void); diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 14b9caebf7a4..51def3ca74c0 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -27,18 +27,35 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void) INIT_LIST_HEAD(&work->async_request_entry); INIT_LIST_HEAD(&work->fp_entry); INIT_LIST_HEAD(&work->interim_entry); + INIT_LIST_HEAD(&work->aux_read_list); + work->iov_alloc_cnt = 4; + work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec), + GFP_KERNEL); + if (!work->iov) { + kmem_cache_free(work_cache, work); + work = NULL; + } } return work; } void ksmbd_free_work_struct(struct ksmbd_work *work) { + struct aux_read *ar, *tmp; + WARN_ON(work->saved_cred != NULL); kvfree(work->response_buf); - kvfree(work->aux_payload_buf); + + list_for_each_entry_safe(ar, tmp, &work->aux_read_list, entry) { + kvfree(ar->buf); + list_del(&ar->entry); + kfree(ar); + } + kfree(work->tr_buf); kvfree(work->request_buf); + kfree(work->iov); if (work->async_id) ksmbd_release_id(&work->conn->async_ida, work->async_id); kmem_cache_free(work_cache, work); @@ -77,3 +94,77 @@ bool ksmbd_queue_work(struct ksmbd_work *work) { return queue_work(ksmbd_wq, &work->work); } + +static int ksmbd_realloc_iov_pin(struct ksmbd_work *work, void *ib, + unsigned int ib_len) +{ + + if (work->iov_alloc_cnt <= work->iov_cnt) { + struct kvec *new; + + work->iov_alloc_cnt += 4; + new = krealloc(work->iov, + sizeof(struct kvec) * work->iov_alloc_cnt, + GFP_KERNEL | __GFP_ZERO); + if (!new) + return -ENOMEM; + work->iov = new; + } + + work->iov[++work->iov_idx].iov_base = ib; + work->iov[work->iov_idx].iov_len = ib_len; + work->iov_cnt++; + + return 0; +} + +static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size) +{ + /* Plus rfc_length size on first iov */ + if (!work->iov_idx) { + work->iov[work->iov_idx].iov_base = work->response_buf; + *(__be32 *)work->iov[0].iov_base = 0; + work->iov[work->iov_idx].iov_len = 4; + work->iov_cnt++; + } + + ksmbd_realloc_iov_pin(work, ib, len); + inc_rfc1001_len(work->iov[0].iov_base, len); + + if (aux_size) { + struct aux_read *ar; + + ksmbd_realloc_iov_pin(work, aux_buf, aux_size); + inc_rfc1001_len(work->iov[0].iov_base, aux_size); + + ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL); + if (!ar) + return -ENOMEM; + + ar->buf = aux_buf; + list_add(&ar->entry, &work->aux_read_list); + } + + return 0; +} + +int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len) +{ + return __ksmbd_iov_pin_rsp(work, ib, len, NULL, 0); +} + +int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size) +{ + return __ksmbd_iov_pin_rsp(work, ib, len, aux_buf, aux_size); +} + +int allocate_interim_rsp_buf(struct ksmbd_work *work) +{ + work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); + if (!work->response_buf) + return -ENOMEM; + work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; + return 0; +} diff --git a/fs/smb/server/ksmbd_work.h b/fs/smb/server/ksmbd_work.h index f8ae6144c0ae..8ca2c813246e 100644 --- a/fs/smb/server/ksmbd_work.h +++ b/fs/smb/server/ksmbd_work.h @@ -19,6 +19,11 @@ enum { KSMBD_WORK_CLOSED, }; +struct aux_read { + void *buf; + struct list_head entry; +}; + /* one of these for every pending CIFS request at the connection */ struct ksmbd_work { /* Server corresponding to this mid */ @@ -31,13 +36,19 @@ struct ksmbd_work { /* Response buffer */ void *response_buf; - /* Read data buffer */ - void *aux_payload_buf; + struct list_head aux_read_list; + + struct kvec *iov; + int iov_alloc_cnt; + int iov_cnt; + int iov_idx; /* Next cmd hdr in compound req buf*/ int next_smb2_rcv_hdr_off; /* Next cmd hdr in compound rsp buf*/ int next_smb2_rsp_hdr_off; + /* Current cmd hdr in compound rsp buf*/ + int curr_smb2_rsp_hdr_off; /* * Current Local FID assigned compound response if SMB2 CREATE @@ -53,16 +64,11 @@ struct ksmbd_work { unsigned int credits_granted; /* response smb header size */ - unsigned int resp_hdr_sz; unsigned int response_sz; - /* Read data count */ - unsigned int aux_payload_sz; void *tr_buf; unsigned char state; - /* Multiple responses for one request e.g. SMB ECHO */ - bool multiRsp:1; /* No response for cancelled request */ bool send_no_response:1; /* Request is encrypted */ @@ -96,6 +102,15 @@ static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work) } /** + * ksmbd_resp_buf_curr - Get current buffer on compound response. + * @work: smb work containing response buffer + */ +static inline void *ksmbd_resp_buf_curr(struct ksmbd_work *work) +{ + return work->response_buf + work->curr_smb2_rsp_hdr_off + 4; +} + +/** * ksmbd_req_buf_next - Get next buffer on compound request. * @work: smb work containing response buffer */ @@ -113,5 +128,8 @@ int ksmbd_work_pool_init(void); int ksmbd_workqueue_init(void); void ksmbd_workqueue_destroy(void); bool ksmbd_queue_work(struct ksmbd_work *work); - +int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size); +int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len); +int allocate_interim_rsp_buf(struct ksmbd_work *work); #endif /* __KSMBD_WORK_H__ */ diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 3fd338293942..5f591751b923 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -34,29 +34,22 @@ struct ksmbd_share_config { #define KSMBD_SHARE_INVALID_UID ((__u16)-1) #define KSMBD_SHARE_INVALID_GID ((__u16)-1) -static inline int share_config_create_mode(struct ksmbd_share_config *share, - umode_t posix_mode) +static inline umode_t +share_config_create_mode(struct ksmbd_share_config *share, + umode_t posix_mode) { - if (!share->force_create_mode) { - if (!posix_mode) - return share->create_mask; - else - return posix_mode & share->create_mask; - } - return share->force_create_mode & share->create_mask; + umode_t mode = (posix_mode ?: (umode_t)-1) & share->create_mask; + + return mode | share->force_create_mode; } -static inline int share_config_directory_mode(struct ksmbd_share_config *share, - umode_t posix_mode) +static inline umode_t +share_config_directory_mode(struct ksmbd_share_config *share, + umode_t posix_mode) { - if (!share->force_directory_mode) { - if (!posix_mode) - return share->directory_mask; - else - return posix_mode & share->directory_mask; - } + umode_t mode = (posix_mode ?: (umode_t)-1) & share->directory_mask; - return share->force_directory_mode & share->directory_mask; + return mode | share->force_directory_mode; } static inline int test_share_config_flag(struct ksmbd_share_config *share, diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 844b303baf29..9bc0103720f5 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -616,15 +616,6 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level) return 0; } -static inline int allocate_oplock_break_buf(struct ksmbd_work *work) -{ - work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); - if (!work->response_buf) - return -ENOMEM; - work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; - return 0; -} - /** * __smb2_oplock_break_noti() - send smb2 oplock break cmd from conn * to client @@ -639,7 +630,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) { struct smb2_oplock_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); - struct ksmbd_conn *conn = work->conn; struct oplock_break_info *br_info = work->request_buf; struct smb2_hdr *rsp_hdr; struct ksmbd_file *fp; @@ -648,7 +638,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) if (!fp) goto out; - if (allocate_oplock_break_buf(work)) { + if (allocate_interim_rsp_buf(work)) { pr_err("smb2_allocate_rsp_buf failed! "); ksmbd_fd_put(work, fp); goto out; @@ -656,8 +646,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->CreditRequest = cpu_to_le16(0); @@ -684,13 +672,15 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp->PersistentFid = fp->persistent_id; rsp->VolatileFid = fp->volatile_id; - inc_rfc1001_len(work->response_buf, 24); + ksmbd_fd_put(work, fp); + if (ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_oplock_break))) + goto out; ksmbd_debug(OPLOCK, "sending oplock break v_id %llu p_id = %llu lock level = %d\n", rsp->VolatileFid, rsp->PersistentFid, rsp->OplockLevel); - ksmbd_fd_put(work, fp); ksmbd_conn_write(work); out: @@ -751,18 +741,15 @@ static void __smb2_lease_break_noti(struct work_struct *wk) struct smb2_lease_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); struct lease_break_info *br_info = work->request_buf; - struct ksmbd_conn *conn = work->conn; struct smb2_hdr *rsp_hdr; - if (allocate_oplock_break_buf(work)) { + if (allocate_interim_rsp_buf(work)) { ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! "); goto out; } rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->CreditRequest = cpu_to_le16(0); @@ -791,7 +778,9 @@ static void __smb2_lease_break_noti(struct work_struct *wk) rsp->AccessMaskHint = 0; rsp->ShareMaskHint = 0; - inc_rfc1001_len(work->response_buf, 44); + if (ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_lease_break))) + goto out; ksmbd_conn_write(work); @@ -1492,7 +1481,7 @@ struct create_context *smb2_find_context_vals(void *open_req, const char *tag, i name_len < 4 || name_off + name_len > cc_len || (value_off & 0x7) != 0 || - (value_off && (value_off < name_off + name_len)) || + (value_len && value_off < name_off + (name_len < 8 ? 8 : name_len)) || ((u64)value_off + value_len > cc_len)) return ERR_PTR(-EINVAL); diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 9df121bdf349..5ab2f52f9b35 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -163,6 +163,7 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, { u16 command = 0; int rc; + bool is_chained = false; if (conn->ops->allocate_rsp_buf(work)) return; @@ -229,14 +230,13 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } } + is_chained = is_chained_smb2_message(work); + if (work->sess && (work->sess->sign || smb3_11_final_sess_setup_resp(work) || conn->ops->is_sign_req(work, command))) conn->ops->set_sign_rsp(work); - } while (is_chained_smb2_message(work)); - - if (work->send_no_response) - return; + } while (is_chained == true); send: smb3_preauth_hash_rsp(work); @@ -590,8 +590,6 @@ static int __init ksmbd_server_init(void) if (ret) goto err_crypto_destroy; - pr_warn_once("The ksmbd server is experimental\n"); - return 0; err_crypto_destroy: diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 7cc1b0c47d0a..544022dd6d20 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -145,12 +145,18 @@ void smb2_set_err_rsp(struct ksmbd_work *work) err_rsp = smb2_get_msg(work->response_buf); if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) { + int err; + err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE; err_rsp->ErrorContextCount = 0; err_rsp->Reserved = 0; err_rsp->ByteCount = 0; err_rsp->ErrorData[0] = 0; - inc_rfc1001_len(work->response_buf, SMB2_ERROR_STRUCTURE_SIZE2); + err = ksmbd_iov_pin_rsp(work, (void *)err_rsp, + __SMB2_HEADER_STRUCTURE_SIZE + + SMB2_ERROR_STRUCTURE_SIZE2); + if (err) + work->send_no_response = 1; } } @@ -245,9 +251,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) struct smb2_hdr *rsp_hdr; struct smb2_negotiate_rsp *rsp; struct ksmbd_conn *conn = work->conn; - - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); + int err; rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); @@ -286,12 +290,13 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); - inc_rfc1001_len(work->response_buf, - sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; + err = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_negotiate_rsp) + AUTH_GSS_LENGTH); + if (err) + return err; conn->use_spnego = true; ksmbd_conn_set_need_negotiate(conn); @@ -390,11 +395,12 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) next_hdr_offset = le32_to_cpu(req->NextCommand); new_len = ALIGN(len, 8); - inc_rfc1001_len(work->response_buf, - sizeof(struct smb2_hdr) + new_len - len); + work->iov[work->iov_idx].iov_len += (new_len - len); + inc_rfc1001_len(work->response_buf, new_len - len); rsp->NextCommand = cpu_to_le32(new_len); work->next_smb2_rcv_hdr_off += next_hdr_offset; + work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off; work->next_smb2_rsp_hdr_off += new_len; ksmbd_debug(SMB, "Compound req new_len = %d rcv off = %d rsp off = %d\n", @@ -470,10 +476,10 @@ bool is_chained_smb2_message(struct ksmbd_work *work) len = len - get_rfc1002_len(work->response_buf); if (len) { ksmbd_debug(SMB, "padding len %u\n", len); + work->iov[work->iov_idx].iov_len += len; inc_rfc1001_len(work->response_buf, len); - if (work->aux_payload_sz) - work->aux_payload_sz += len; } + work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off; } return false; } @@ -488,11 +494,8 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) { struct smb2_hdr *rsp_hdr = smb2_get_msg(work->response_buf); struct smb2_hdr *rcv_hdr = smb2_get_msg(work->request_buf); - struct ksmbd_conn *conn = work->conn; memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->Command = rcv_hdr->Command; @@ -657,7 +660,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) struct ksmbd_conn *conn = work->conn; int id; - rsp_hdr = smb2_get_msg(work->response_buf); + rsp_hdr = ksmbd_resp_buf_next(work); rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND; id = ksmbd_acquire_async_msg_id(&conn->async_ida); @@ -706,15 +709,24 @@ void release_async_work(struct ksmbd_work *work) void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) { struct smb2_hdr *rsp_hdr; + struct ksmbd_work *in_work = ksmbd_alloc_work_struct(); - rsp_hdr = smb2_get_msg(work->response_buf); - smb2_set_err_rsp(work); + if (allocate_interim_rsp_buf(in_work)) { + pr_err("smb_allocate_rsp_buf failed!\n"); + ksmbd_free_work_struct(in_work); + return; + } + + in_work->conn = work->conn; + memcpy(smb2_get_msg(in_work->response_buf), ksmbd_resp_buf_next(work), + __SMB2_HEADER_STRUCTURE_SIZE); + + rsp_hdr = smb2_get_msg(in_work->response_buf); + smb2_set_err_rsp(in_work); rsp_hdr->Status = status; - work->multiRsp = 1; - ksmbd_conn_write(work); - rsp_hdr->Status = 0; - work->multiRsp = 0; + ksmbd_conn_write(in_work); + ksmbd_free_work_struct(in_work); } static __le32 smb2_get_reparse_tag_special_file(umode_t mode) @@ -821,9 +833,8 @@ static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) pneg_ctxt->Name[15] = 0x7C; } -static void assemble_neg_contexts(struct ksmbd_conn *conn, - struct smb2_negotiate_rsp *rsp, - void *smb2_buf_len) +static unsigned int assemble_neg_contexts(struct ksmbd_conn *conn, + struct smb2_negotiate_rsp *rsp) { char * const pneg_ctxt = (char *)rsp + le32_to_cpu(rsp->NegotiateContextOffset); @@ -834,7 +845,6 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt, conn->preauth_info->Preauth_HashId); - inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING); ctxt_size = sizeof(struct smb2_preauth_neg_context); if (conn->cipher_type) { @@ -874,7 +884,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, } rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); - inc_rfc1001_len(smb2_buf_len, ctxt_size); + return ctxt_size + AUTH_GSS_PADDING; } static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, @@ -1090,7 +1100,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) struct smb2_negotiate_req *req = smb2_get_msg(work->request_buf); struct smb2_negotiate_rsp *rsp = smb2_get_msg(work->response_buf); int rc = 0; - unsigned int smb2_buf_len, smb2_neg_size; + unsigned int smb2_buf_len, smb2_neg_size, neg_ctxt_len = 0; __le32 status; ksmbd_debug(SMB, "Received negotiate request\n"); @@ -1183,7 +1193,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) conn->preauth_info->Preauth_HashValue); rsp->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); - assemble_neg_contexts(conn, rsp, work->response_buf); + neg_ctxt_len = assemble_neg_contexts(conn, rsp); break; case SMB302_PROT_ID: init_smb3_02_server(conn); @@ -1233,8 +1243,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); - inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); + rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; conn->use_spnego = true; @@ -1252,9 +1261,15 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + if (rc) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + + if (!rc) + rc = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_negotiate_rsp) + + AUTH_GSS_LENGTH + neg_ctxt_len); if (rc < 0) smb2_set_err_rsp(work); - return rc; } @@ -1454,7 +1469,6 @@ static int ntlm_authenticate(struct ksmbd_work *work, memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); - inc_rfc1001_len(work->response_buf, spnego_blob_len - 1); } user = session_user(conn, req); @@ -1600,7 +1614,6 @@ static int krb5_authenticate(struct ksmbd_work *work, return -EINVAL; } rsp->SecurityBufferLength = cpu_to_le16(out_len); - inc_rfc1001_len(work->response_buf, out_len - 1); if ((conn->sign || server_conf.enforced_signing) || (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) @@ -1672,7 +1685,6 @@ int smb2_sess_setup(struct ksmbd_work *work) rsp->SessionFlags = 0; rsp->SecurityBufferOffset = cpu_to_le16(72); rsp->SecurityBufferLength = 0; - inc_rfc1001_len(work->response_buf, 9); ksmbd_conn_lock(conn); if (!req->hdr.SessionId) { @@ -1808,13 +1820,6 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; rsp->hdr.Status = STATUS_MORE_PROCESSING_REQUIRED; - /* - * Note: here total size -1 is done as an - * adjustment for 0 size blob - */ - inc_rfc1001_len(work->response_buf, - le16_to_cpu(rsp->SecurityBufferLength) - 1); - } else if (negblob->MessageType == NtLmAuthenticate) { rc = ntlm_authenticate(work, req, rsp); if (rc) @@ -1899,6 +1904,18 @@ out_err: ksmbd_conn_set_need_negotiate(conn); } } + smb2_set_err_rsp(work); + } else { + unsigned int iov_len; + + if (rsp->SecurityBufferLength) + iov_len = offsetof(struct smb2_sess_setup_rsp, Buffer) + + le16_to_cpu(rsp->SecurityBufferLength); + else + iov_len = sizeof(struct smb2_sess_setup_rsp); + rc = ksmbd_iov_pin_rsp(work, rsp, iov_len); + if (rc) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; } ksmbd_conn_unlock(conn); @@ -1977,13 +1994,16 @@ int smb2_tree_connect(struct ksmbd_work *work) status.tree_conn->posix_extensions = true; rsp->StructureSize = cpu_to_le16(16); - inc_rfc1001_len(work->response_buf, 16); out_err1: rsp->Capabilities = 0; rsp->Reserved = 0; /* default manual caching */ rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING; + rc = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_tree_connect_rsp)); + if (rc) + status.ret = KSMBD_TREE_CONN_STATUS_NOMEM; + if (!IS_ERR(treename)) kfree(treename); if (!IS_ERR(name)) @@ -2096,20 +2116,27 @@ int smb2_tree_disconnect(struct ksmbd_work *work) struct smb2_tree_disconnect_req *req; struct ksmbd_session *sess = work->sess; struct ksmbd_tree_connect *tcon = work->tcon; + int err; WORK_BUFFERS(work, req, rsp); - rsp->StructureSize = cpu_to_le16(4); - inc_rfc1001_len(work->response_buf, 4); - ksmbd_debug(SMB, "request\n"); + rsp->StructureSize = cpu_to_le16(4); + err = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_tree_disconnect_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } + if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return 0; + return -ENOENT; } ksmbd_close_tree_conn_fds(work); @@ -2131,15 +2158,21 @@ int smb2_session_logoff(struct ksmbd_work *work) struct smb2_logoff_rsp *rsp; struct ksmbd_session *sess; u64 sess_id; + int err; WORK_BUFFERS(work, req, rsp); + ksmbd_debug(SMB, "request\n"); + sess_id = le64_to_cpu(req->hdr.SessionId); rsp->StructureSize = cpu_to_le16(4); - inc_rfc1001_len(work->response_buf, 4); - - ksmbd_debug(SMB, "request\n"); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); ksmbd_close_session_fds(work); @@ -2154,7 +2187,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return 0; + return -ENOENT; } ksmbd_destroy_file_table(&sess->file_table); @@ -2215,7 +2248,10 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work) rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; - inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/ + err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_create_rsp, Buffer)); + if (err) + goto out; + kfree(name); return 0; @@ -2597,6 +2633,7 @@ int smb2_open(struct ksmbd_work *work) u64 time; umode_t posix_mode = 0; __le32 daccess, maximal_access = 0; + int iov_len = 0; WORK_BUFFERS(work, req, rsp); @@ -3248,7 +3285,7 @@ int smb2_open(struct ksmbd_work *work) rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; - inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/ + iov_len = offsetof(struct smb2_create_rsp, Buffer); /* If lease is request send lease context response */ if (opinfo && opinfo->is_lease) { @@ -3263,8 +3300,7 @@ int smb2_open(struct ksmbd_work *work) create_lease_buf(rsp->Buffer, opinfo->o_lease); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_lease_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_lease_size); + iov_len += conn->vals->create_lease_size; next_ptr = &lease_ccontext->Next; next_off = conn->vals->create_lease_size; } @@ -3284,8 +3320,7 @@ int smb2_open(struct ksmbd_work *work) le32_to_cpu(maximal_access)); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_mxac_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_mxac_size); + iov_len += conn->vals->create_mxac_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); next_ptr = &mxac_ccontext->Next; @@ -3303,8 +3338,7 @@ int smb2_open(struct ksmbd_work *work) stat.ino, tcon->id); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_disk_id_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_disk_id_size); + iov_len += conn->vals->create_disk_id_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); next_ptr = &disk_id_ccontext->Next; @@ -3318,8 +3352,7 @@ int smb2_open(struct ksmbd_work *work) fp); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_posix_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_posix_size); + iov_len += conn->vals->create_posix_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); } @@ -3337,7 +3370,8 @@ err_out: } ksmbd_revert_fsids(work); err_out1: - + if (!rc) + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -4063,7 +4097,10 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferOffset = cpu_to_le16(0); rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; - inc_rfc1001_len(work->response_buf, 9); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_query_directory_rsp)); + if (rc) + goto err_out; } else { no_buf_len: ((struct file_directory_info *) @@ -4075,7 +4112,11 @@ no_buf_len: rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); rsp->OutputBufferLength = cpu_to_le32(d_info.data_count); - inc_rfc1001_len(work->response_buf, 8 + d_info.data_count); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_query_directory_rsp, Buffer) + + d_info.data_count); + if (rc) + goto err_out; } kfree(srch_ptr); @@ -4116,27 +4157,18 @@ err_out2: * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length * @rsp_org: base response buffer pointer in case of chained response - * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error */ static int buffer_check_err(int reqOutputBufferLength, struct smb2_query_info_rsp *rsp, - void *rsp_org, int infoclass_size) + void *rsp_org) { if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) { - if (reqOutputBufferLength < infoclass_size) { - pr_err("Invalid Buffer Size Requested\n"); - rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; - *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr)); - return -EINVAL; - } - - ksmbd_debug(SMB, "Buffer Overflow\n"); - rsp->hdr.Status = STATUS_BUFFER_OVERFLOW; - *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr) + - reqOutputBufferLength); - rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength); + pr_err("Invalid Buffer Size Requested\n"); + rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; + *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr)); + return -EINVAL; } return 0; } @@ -4155,7 +4187,6 @@ static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp, sinfo->Directory = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_standard_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_standard_info)); } static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num, @@ -4169,7 +4200,6 @@ static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num, file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63)); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_internal_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); } static int smb2_get_info_file_pipe(struct ksmbd_session *sess, @@ -4195,14 +4225,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, case FILE_STANDARD_INFORMATION: get_standard_info_pipe(rsp, rsp_org); rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, rsp_org, - FILE_STANDARD_INFORMATION_SIZE); + rsp, rsp_org); break; case FILE_INTERNAL_INFORMATION: get_internal_info_pipe(rsp, id, rsp_org); rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, rsp_org, - FILE_INTERNAL_INFORMATION_SIZE); + rsp, rsp_org); break; default: ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n", @@ -4308,7 +4336,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) name_len -= XATTR_USER_PREFIX_LEN; - ptr = (char *)(&eainfo->name + name_len + 1); + ptr = eainfo->name + name_len + 1; buf_free_len -= (offsetof(struct smb2_ea_info, name) + name_len + 1); /* bailout if xattr can't fit in buf_free_len */ @@ -4370,7 +4398,6 @@ done: if (rsp_data_cnt == 0) rsp->hdr.Status = STATUS_NO_EAS_ON_FILE; rsp->OutputBufferLength = cpu_to_le32(rsp_data_cnt); - inc_rfc1001_len(rsp_org, rsp_data_cnt); out: kvfree(xattr_list); return rc; @@ -4385,7 +4412,6 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp, file_info->AccessFlags = fp->daccess; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_access_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_access_info)); } static int get_file_basic_info(struct smb2_query_info_rsp *rsp, @@ -4402,8 +4428,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, } basic_info = (struct smb2_file_basic_info *)rsp->Buffer; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); basic_info->LastAccessTime = cpu_to_le64(time); @@ -4415,7 +4441,6 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, basic_info->Pad1 = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_basic_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info)); return 0; } @@ -4428,7 +4453,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct kstat stat; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); @@ -4440,8 +4465,6 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_standard_info)); - inc_rfc1001_len(rsp_org, - sizeof(struct smb2_file_standard_info)); } static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, @@ -4453,8 +4476,6 @@ static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, file_info->AlignmentRequirement = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_alignment_info)); - inc_rfc1001_len(rsp_org, - sizeof(struct smb2_file_alignment_info)); } static int get_file_all_info(struct ksmbd_work *work, @@ -4482,7 +4503,7 @@ static int get_file_all_info(struct ksmbd_work *work, return PTR_ERR(filename); inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); ksmbd_debug(SMB, "filename = %s\n", filename); delete_pending = ksmbd_inode_pending_delete(fp); @@ -4518,7 +4539,6 @@ static int get_file_all_info(struct ksmbd_work *work, rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_all_info) + conv_len - 1); kfree(filename); - inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); return 0; } @@ -4541,7 +4561,6 @@ static void get_file_alternate_info(struct ksmbd_work *work, file_info->FileNameLength = cpu_to_le32(conv_len); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len); - inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); } static void get_file_stream_info(struct ksmbd_work *work, @@ -4559,8 +4578,8 @@ static void get_file_stream_info(struct ksmbd_work *work, int buf_free_len; struct smb2_query_info_req *req = ksmbd_req_buf_next(work); - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; buf_free_len = @@ -4641,7 +4660,6 @@ out: kvfree(xattr_list); rsp->OutputBufferLength = cpu_to_le32(nbytes); - inc_rfc1001_len(rsp_org, nbytes); } static void get_file_internal_info(struct smb2_query_info_rsp *rsp, @@ -4650,13 +4668,12 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp, struct smb2_file_internal_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_internal_info *)rsp->Buffer; file_info->IndexNumber = cpu_to_le64(stat.ino); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_internal_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); } static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, @@ -4676,7 +4693,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); file_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4692,7 +4709,6 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ntwrk_info)); return 0; } @@ -4704,7 +4720,6 @@ static void get_file_ea_info(struct smb2_query_info_rsp *rsp, void *rsp_org) file_info->EASize = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ea_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ea_info)); } static void get_file_position_info(struct smb2_query_info_rsp *rsp, @@ -4716,7 +4731,6 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp, file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_pos_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_pos_info)); } static void get_file_mode_info(struct smb2_query_info_rsp *rsp, @@ -4728,7 +4742,6 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp, file_info->Mode = fp->coption & FILE_MODE_INFO_MASK; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_mode_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_mode_info)); } static void get_file_compression_info(struct smb2_query_info_rsp *rsp, @@ -4737,8 +4750,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, struct smb2_file_comp_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_comp_info *)rsp->Buffer; file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9); @@ -4750,7 +4763,6 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_comp_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_comp_info)); } static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, @@ -4769,11 +4781,10 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, file_info->ReparseTag = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_attr_tag_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_attr_tag_info)); return 0; } -static int find_file_posix_info(struct smb2_query_info_rsp *rsp, +static void find_file_posix_info(struct smb2_query_info_rsp *rsp, struct ksmbd_file *fp, void *rsp_org) { struct smb311_posix_qinfo *file_info; @@ -4790,7 +4801,7 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->LastAccessTime = cpu_to_le64(time); time = ksmbd_UnixTimeToNT(inode->i_mtime); file_info->LastWriteTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_ctime); + time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); file_info->ChangeTime = cpu_to_le64(time); file_info->DosAttributes = fp->f_ci->m_fattr; file_info->Inode = cpu_to_le64(inode->i_ino); @@ -4811,8 +4822,6 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]); rsp->OutputBufferLength = cpu_to_le32(out_buf_len); - inc_rfc1001_len(rsp_org, out_buf_len); - return out_buf_len; } static int smb2_get_info_file(struct ksmbd_work *work, @@ -4822,7 +4831,6 @@ static int smb2_get_info_file(struct ksmbd_work *work, struct ksmbd_file *fp; int fileinfoclass = 0; int rc = 0; - int file_infoclass_size; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; if (test_share_config_flag(work->tcon->share_conf, @@ -4855,85 +4863,69 @@ static int smb2_get_info_file(struct ksmbd_work *work, switch (fileinfoclass) { case FILE_ACCESS_INFORMATION: get_file_access_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE; break; case FILE_BASIC_INFORMATION: rc = get_file_basic_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_BASIC_INFORMATION_SIZE; break; case FILE_STANDARD_INFORMATION: get_file_standard_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE; break; case FILE_ALIGNMENT_INFORMATION: get_file_alignment_info(rsp, work->response_buf); - file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE; break; case FILE_ALL_INFORMATION: rc = get_file_all_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_ALL_INFORMATION_SIZE; break; case FILE_ALTERNATE_NAME_INFORMATION: get_file_alternate_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE; break; case FILE_STREAM_INFORMATION: get_file_stream_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_STREAM_INFORMATION_SIZE; break; case FILE_INTERNAL_INFORMATION: get_file_internal_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE; break; case FILE_NETWORK_OPEN_INFORMATION: rc = get_file_network_open_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE; break; case FILE_EA_INFORMATION: get_file_ea_info(rsp, work->response_buf); - file_infoclass_size = FILE_EA_INFORMATION_SIZE; break; case FILE_FULL_EA_INFORMATION: rc = smb2_get_ea(work, fp, req, rsp, work->response_buf); - file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE; break; case FILE_POSITION_INFORMATION: get_file_position_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_POSITION_INFORMATION_SIZE; break; case FILE_MODE_INFORMATION: get_file_mode_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_MODE_INFORMATION_SIZE; break; case FILE_COMPRESSION_INFORMATION: get_file_compression_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE; break; case FILE_ATTRIBUTE_TAG_INFORMATION: rc = get_file_attribute_tag_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE; break; case SMB_FIND_FILE_POSIX_INFO: if (!work->tcon->posix_extensions) { pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); rc = -EOPNOTSUPP; } else { - file_infoclass_size = find_file_posix_info(rsp, fp, - work->response_buf); + find_file_posix_info(rsp, fp, work->response_buf); } break; default: @@ -4943,8 +4935,7 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (!rc) rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, work->response_buf, - file_infoclass_size); + rsp, work->response_buf); ksmbd_fd_put(work, fp); return rc; } @@ -4960,7 +4951,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, struct kstatfs stfs; struct path path; int rc = 0, len; - int fs_infoclass_size = 0; if (!share->path) return -EIO; @@ -4990,8 +4980,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->DeviceType = cpu_to_le32(stfs.f_type); info->DeviceCharacteristics = cpu_to_le32(0x00000020); rsp->OutputBufferLength = cpu_to_le32(8); - inc_rfc1001_len(work->response_buf, 8); - fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE; break; } case FS_ATTRIBUTE_INFORMATION: @@ -5020,8 +5008,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->FileSystemNameLen = cpu_to_le32(len); sz = sizeof(struct filesystem_attribute_info) - 2 + len; rsp->OutputBufferLength = cpu_to_le32(sz); - inc_rfc1001_len(work->response_buf, sz); - fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE; break; } case FS_VOLUME_INFORMATION: @@ -5048,8 +5034,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->Reserved = 0; sz = sizeof(struct filesystem_vol_info) - 2 + len; rsp->OutputBufferLength = cpu_to_le32(sz); - inc_rfc1001_len(work->response_buf, sz); - fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE; break; } case FS_SIZE_INFORMATION: @@ -5062,8 +5046,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->SectorsPerAllocationUnit = cpu_to_le32(1); info->BytesPerSector = cpu_to_le32(stfs.f_bsize); rsp->OutputBufferLength = cpu_to_le32(24); - inc_rfc1001_len(work->response_buf, 24); - fs_infoclass_size = FS_SIZE_INFORMATION_SIZE; break; } case FS_FULL_SIZE_INFORMATION: @@ -5079,8 +5061,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->SectorsPerAllocationUnit = cpu_to_le32(1); info->BytesPerSector = cpu_to_le32(stfs.f_bsize); rsp->OutputBufferLength = cpu_to_le32(32); - inc_rfc1001_len(work->response_buf, 32); - fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE; break; } case FS_OBJECT_ID_INFORMATION: @@ -5100,8 +5080,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->extended_info.rel_date = 0; memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0")); rsp->OutputBufferLength = cpu_to_le32(64); - inc_rfc1001_len(work->response_buf, 64); - fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE; break; } case FS_SECTOR_SIZE_INFORMATION: @@ -5123,8 +5101,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->ByteOffsetForSectorAlignment = 0; info->ByteOffsetForPartitionAlignment = 0; rsp->OutputBufferLength = cpu_to_le32(28); - inc_rfc1001_len(work->response_buf, 28); - fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE; break; } case FS_CONTROL_INFORMATION: @@ -5145,8 +5121,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID); info->Padding = 0; rsp->OutputBufferLength = cpu_to_le32(48); - inc_rfc1001_len(work->response_buf, 48); - fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE; break; } case FS_POSIX_INFORMATION: @@ -5166,8 +5140,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->TotalFileNodes = cpu_to_le64(stfs.f_files); info->FreeFileNodes = cpu_to_le64(stfs.f_ffree); rsp->OutputBufferLength = cpu_to_le32(56); - inc_rfc1001_len(work->response_buf, 56); - fs_infoclass_size = FS_POSIX_INFORMATION_SIZE; } break; } @@ -5176,8 +5148,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, return -EOPNOTSUPP; } rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, work->response_buf, - fs_infoclass_size); + rsp, work->response_buf); path_put(&path); return rc; } @@ -5211,7 +5182,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work, secdesclen = sizeof(struct smb_ntsd); rsp->OutputBufferLength = cpu_to_le32(secdesclen); - inc_rfc1001_len(work->response_buf, secdesclen); return 0; } @@ -5256,7 +5226,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work, return rc; rsp->OutputBufferLength = cpu_to_le32(secdesclen); - inc_rfc1001_len(work->response_buf, secdesclen); return 0; } @@ -5295,6 +5264,14 @@ int smb2_query_info(struct ksmbd_work *work) rc = -EOPNOTSUPP; } + if (!rc) { + rsp->StructureSize = cpu_to_le16(9); + rsp->OutputBufferOffset = cpu_to_le16(72); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_query_info_rsp, Buffer) + + le32_to_cpu(rsp->OutputBufferLength)); + } + if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; @@ -5302,6 +5279,8 @@ int smb2_query_info(struct ksmbd_work *work) rsp->hdr.Status = STATUS_FILE_CLOSED; else if (rc == -EIO) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + else if (rc == -ENOMEM) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; else if (rc == -EOPNOTSUPP || rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; smb2_set_err_rsp(work); @@ -5310,9 +5289,6 @@ int smb2_query_info(struct ksmbd_work *work) rc); return rc; } - rsp->StructureSize = cpu_to_le16(9); - rsp->OutputBufferOffset = cpu_to_le16(72); - inc_rfc1001_len(work->response_buf, 8); return 0; } @@ -5343,8 +5319,9 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work) rsp->AllocationSize = 0; rsp->EndOfFile = 0; rsp->Attributes = 0; - inc_rfc1001_len(work->response_buf, 60); - return 0; + + return ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_close_rsp)); } /** @@ -5433,7 +5410,7 @@ int smb2_close(struct ksmbd_work *work) rsp->LastAccessTime = cpu_to_le64(time); time = ksmbd_UnixTimeToNT(inode->i_mtime); rsp->LastWriteTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_ctime); + time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); rsp->ChangeTime = cpu_to_le64(time); ksmbd_fd_put(work, fp); } else { @@ -5449,15 +5426,17 @@ int smb2_close(struct ksmbd_work *work) err = ksmbd_close_fd(work, volatile_id); out: + if (!err) + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_close_rsp)); + if (err) { if (rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_FILE_CLOSED; smb2_set_err_rsp(work); - } else { - inc_rfc1001_len(work->response_buf, 60); } - return 0; + return err; } /** @@ -5475,8 +5454,7 @@ int smb2_echo(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); - return 0; + return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_echo_rsp)); } static int smb2_rename(struct ksmbd_work *work, @@ -5656,7 +5634,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); else - attrs.ia_ctime = inode->i_ctime; + attrs.ia_ctime = inode_get_ctime(inode); if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5701,7 +5679,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); - inode->i_ctime = attrs.ia_ctime; + inode_set_ctime_to_ts(inode, attrs.ia_ctime); attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); @@ -6068,7 +6046,10 @@ int smb2_set_info(struct ksmbd_work *work) goto err_out; rsp->StructureSize = cpu_to_le16(2); - inc_rfc1001_len(work->response_buf, 2); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_set_info_rsp)); + if (rc) + goto err_out; ksmbd_fd_put(work, fp); return 0; @@ -6115,28 +6096,36 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) id = req->VolatileFileId; - inc_rfc1001_len(work->response_buf, 16); rpc_resp = ksmbd_rpc_read(work->sess, id); if (rpc_resp) { + void *aux_payload_buf; + if (rpc_resp->flags != KSMBD_RPC_OK) { err = -EINVAL; goto out; } - work->aux_payload_buf = + aux_payload_buf = kvmalloc(rpc_resp->payload_sz, GFP_KERNEL); - if (!work->aux_payload_buf) { + if (!aux_payload_buf) { err = -ENOMEM; goto out; } - memcpy(work->aux_payload_buf, rpc_resp->payload, - rpc_resp->payload_sz); + memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz); nbytes = rpc_resp->payload_sz; - work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4; - work->aux_payload_sz = nbytes; kvfree(rpc_resp); + err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer), + aux_payload_buf, nbytes); + if (err) + goto out; + } else { + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer)); + if (err) + goto out; } rsp->StructureSize = cpu_to_le16(17); @@ -6145,7 +6134,6 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = 0; rsp->Flags = 0; - inc_rfc1001_len(work->response_buf, nbytes); return 0; out: @@ -6219,13 +6207,8 @@ int smb2_read(struct ksmbd_work *work) int err = 0; bool is_rdma_channel = false; unsigned int max_read_size = conn->vals->max_read_size; - - WORK_BUFFERS(work, req, rsp); - if (work->next_smb2_rcv_hdr_off) { - work->send_no_response = 1; - err = -EOPNOTSUPP; - goto out; - } + unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; + void *aux_payload_buf; if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { @@ -6233,6 +6216,25 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (work->next_smb2_rcv_hdr_off) { + req = ksmbd_req_buf_next(work); + rsp = ksmbd_resp_buf_next(work); + if (!has_file_id(req->VolatileFileId)) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + pid = work->compound_pfid; + } + } else { + req = smb2_get_msg(work->request_buf); + rsp = smb2_get_msg(work->response_buf); + } + + if (!has_file_id(id)) { + id = req->VolatileFileId; + pid = req->PersistentFileId; + } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { is_rdma_channel = true; @@ -6255,7 +6257,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); + fp = ksmbd_lookup_fd_slow(work, id, pid); if (!fp) { err = -ENOENT; goto out; @@ -6281,21 +6283,20 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", fp->filp, offset, length); - work->aux_payload_buf = kvzalloc(length, GFP_KERNEL); - if (!work->aux_payload_buf) { + aux_payload_buf = kvzalloc(length, GFP_KERNEL); + if (!aux_payload_buf) { err = -ENOMEM; goto out; } - nbytes = ksmbd_vfs_read(work, fp, length, &offset); + nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf); if (nbytes < 0) { err = nbytes; goto out; } if ((nbytes == 0 && length != 0) || nbytes < mincount) { - kvfree(work->aux_payload_buf); - work->aux_payload_buf = NULL; + kvfree(aux_payload_buf); rsp->hdr.Status = STATUS_END_OF_FILE; smb2_set_err_rsp(work); ksmbd_fd_put(work, fp); @@ -6308,11 +6309,10 @@ int smb2_read(struct ksmbd_work *work) if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = smb2_read_rdma_channel(work, req, - work->aux_payload_buf, + aux_payload_buf, nbytes); - kvfree(work->aux_payload_buf); - work->aux_payload_buf = NULL; - + kvfree(aux_payload_buf); + aux_payload_buf = NULL; nbytes = 0; if (remain_bytes < 0) { err = (int)remain_bytes; @@ -6326,10 +6326,11 @@ int smb2_read(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = cpu_to_le32(remain_bytes); rsp->Flags = 0; - inc_rfc1001_len(work->response_buf, 16); - work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4; - work->aux_payload_sz = nbytes; - inc_rfc1001_len(work->response_buf, nbytes); + err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer), + aux_payload_buf, nbytes); + if (err) + goto out; ksmbd_fd_put(work, fp); return 0; @@ -6412,8 +6413,8 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(length); rsp->DataRemaining = 0; rsp->Reserved2 = 0; - inc_rfc1001_len(work->response_buf, 16); - return 0; + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_write_rsp, Buffer)); out: if (err) { rsp->hdr.Status = STATUS_INVALID_HANDLE; @@ -6569,7 +6570,9 @@ int smb2_write(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = 0; rsp->Reserved2 = 0; - inc_rfc1001_len(work->response_buf, 16); + err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_write_rsp, Buffer)); + if (err) + goto out; ksmbd_fd_put(work, fp); return 0; @@ -6616,15 +6619,11 @@ int smb2_flush(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); - return 0; + return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_flush_rsp)); out: - if (err) { - rsp->hdr.Status = STATUS_INVALID_HANDLE; - smb2_set_err_rsp(work); - } - + rsp->hdr.Status = STATUS_INVALID_HANDLE; + smb2_set_err_rsp(work); return err; } @@ -7078,8 +7077,6 @@ skip: goto out; } - init_smb2_rsp_hdr(work); - smb2_set_err_rsp(work); rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED; kfree(smb_lock); @@ -7114,7 +7111,10 @@ skip: ksmbd_debug(SMB, "successful in taking lock\n"); rsp->hdr.Status = STATUS_SUCCESS; rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lock_rsp)); + if (err) + goto out; + ksmbd_fd_put(work, fp); return 0; @@ -7910,9 +7910,9 @@ dup_ext_out: rsp->Reserved = cpu_to_le16(0); rsp->Flags = cpu_to_le32(0); rsp->Reserved2 = cpu_to_le32(0); - inc_rfc1001_len(work->response_buf, 48 + nbytes); - - return 0; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_ioctl_rsp) + nbytes); + if (!ret) + return ret; out: if (ret == -EACCES) @@ -8047,8 +8047,9 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->Reserved2 = 0; rsp->VolatileFid = volatile_id; rsp->PersistentFid = persistent_id; - inc_rfc1001_len(work->response_buf, 24); - return; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break)); + if (!ret) + return; err_out: opinfo->op_state = OPLOCK_STATE_NONE; @@ -8198,8 +8199,9 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) memcpy(rsp->LeaseKey, req->LeaseKey, 16); rsp->LeaseState = lease_state; rsp->LeaseDuration = 0; - inc_rfc1001_len(work->response_buf, 36); - return; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack)); + if (!ret) + return; err_out: opinfo->op_state = OPLOCK_STATE_NONE; @@ -8337,43 +8339,19 @@ int smb2_check_sign_req(struct ksmbd_work *work) void smb2_set_sign_rsp(struct ksmbd_work *work) { struct smb2_hdr *hdr; - struct smb2_hdr *req_hdr; char signature[SMB2_HMACSHA256_SIZE]; - struct kvec iov[2]; - size_t len; + struct kvec *iov; int n_vec = 1; - hdr = smb2_get_msg(work->response_buf); - if (work->next_smb2_rsp_hdr_off) - hdr = ksmbd_resp_buf_next(work); - - req_hdr = ksmbd_req_buf_next(work); - - if (!work->next_smb2_rsp_hdr_off) { - len = get_rfc1002_len(work->response_buf); - if (req_hdr->NextCommand) - len = ALIGN(len, 8); - } else { - len = get_rfc1002_len(work->response_buf) - - work->next_smb2_rsp_hdr_off; - len = ALIGN(len, 8); - } - - if (req_hdr->NextCommand) - hdr->NextCommand = cpu_to_le32(len); - + hdr = ksmbd_resp_buf_curr(work); hdr->Flags |= SMB2_FLAGS_SIGNED; memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); - iov[0].iov_base = (char *)&hdr->ProtocolId; - iov[0].iov_len = len; - - if (work->aux_payload_sz) { - iov[0].iov_len -= work->aux_payload_sz; - - iov[1].iov_base = work->aux_payload_buf; - iov[1].iov_len = work->aux_payload_sz; + if (hdr->Command == SMB2_READ) { + iov = &work->iov[work->iov_idx - 1]; n_vec++; + } else { + iov = &work->iov[work->iov_idx]; } if (!ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, n_vec, @@ -8449,29 +8427,14 @@ int smb3_check_sign_req(struct ksmbd_work *work) void smb3_set_sign_rsp(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_hdr *req_hdr, *hdr; + struct smb2_hdr *hdr; struct channel *chann; char signature[SMB2_CMACAES_SIZE]; - struct kvec iov[2]; + struct kvec *iov; int n_vec = 1; - size_t len; char *signing_key; - hdr = smb2_get_msg(work->response_buf); - if (work->next_smb2_rsp_hdr_off) - hdr = ksmbd_resp_buf_next(work); - - req_hdr = ksmbd_req_buf_next(work); - - if (!work->next_smb2_rsp_hdr_off) { - len = get_rfc1002_len(work->response_buf); - if (req_hdr->NextCommand) - len = ALIGN(len, 8); - } else { - len = get_rfc1002_len(work->response_buf) - - work->next_smb2_rsp_hdr_off; - len = ALIGN(len, 8); - } + hdr = ksmbd_resp_buf_curr(work); if (conn->binding == false && le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { @@ -8487,21 +8450,18 @@ void smb3_set_sign_rsp(struct ksmbd_work *work) if (!signing_key) return; - if (req_hdr->NextCommand) - hdr->NextCommand = cpu_to_le32(len); - hdr->Flags |= SMB2_FLAGS_SIGNED; memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); - iov[0].iov_base = (char *)&hdr->ProtocolId; - iov[0].iov_len = len; - if (work->aux_payload_sz) { - iov[0].iov_len -= work->aux_payload_sz; - iov[1].iov_base = work->aux_payload_buf; - iov[1].iov_len = work->aux_payload_sz; + + if (hdr->Command == SMB2_READ) { + iov = &work->iov[work->iov_idx - 1]; n_vec++; + } else { + iov = &work->iov[work->iov_idx]; } - if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, signature)) + if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, + signature)) memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE); } @@ -8568,45 +8528,22 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type) int smb3_encrypt_resp(struct ksmbd_work *work) { - char *buf = work->response_buf; - struct kvec iov[3]; + struct kvec *iov = work->iov; int rc = -ENOMEM; - int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 1 : 0); + void *tr_buf; - if (ARRAY_SIZE(iov) < rq_nvec) - return -ENOMEM; - - work->tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); - if (!work->tr_buf) + tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); + if (!tr_buf) return rc; /* fill transform header */ - fill_transform_hdr(work->tr_buf, buf, work->conn->cipher_type); + fill_transform_hdr(tr_buf, work->response_buf, work->conn->cipher_type); - iov[0].iov_base = work->tr_buf; + iov[0].iov_base = tr_buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; - buf_size += iov[0].iov_len - 4; - - iov[1].iov_base = buf + 4; - iov[1].iov_len = get_rfc1002_len(buf); - if (work->aux_payload_sz) { - iov[1].iov_len = work->resp_hdr_sz - 4; - - iov[2].iov_base = work->aux_payload_buf; - iov[2].iov_len = work->aux_payload_sz; - buf_size += iov[2].iov_len; - } - buf_size += iov[1].iov_len; - work->resp_hdr_sz = iov[1].iov_len; + work->tr_buf = tr_buf; - rc = ksmbd_crypt_message(work, iov, rq_nvec, 1); - if (rc) - return rc; - - memmove(buf, iov[1].iov_base, iov[1].iov_len); - *(__be32 *)work->tr_buf = cpu_to_be32(buf_size); - - return rc; + return ksmbd_crypt_message(work, iov, work->iov_idx + 1, 1); } bool smb3_is_transform_hdr(void *buf) diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 2767c08a534a..d12cfd3b0927 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -361,7 +361,7 @@ struct smb2_ea_info { __u8 Flags; __u8 EaNameLength; __le16 EaValueLength; - char name[1]; + char name[]; /* optionally followed by value */ } __packed; /* level 15 Query */ diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index c2b75d898852..e6ba1e9b8589 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -319,12 +319,6 @@ static int init_smb1_rsp_hdr(struct ksmbd_work *work) struct smb_hdr *rsp_hdr = (struct smb_hdr *)work->response_buf; struct smb_hdr *rcv_hdr = (struct smb_hdr *)work->request_buf; - /* - * Remove 4 byte direct TCP header. - */ - *(__be32 *)work->response_buf = - cpu_to_be32(sizeof(struct smb_hdr) - 4); - rsp_hdr->Command = SMB_COM_NEGOTIATE; *(__le32 *)rsp_hdr->Protocol = SMB1_PROTO_NUMBER; rsp_hdr->Flags = SMBFLG_RESPONSE; @@ -560,10 +554,11 @@ static int smb_handle_negotiate(struct ksmbd_work *work) ksmbd_debug(SMB, "Unsupported SMB1 protocol\n"); - /* Add 2 byte bcc and 2 byte DialectIndex. */ - inc_rfc1001_len(work->response_buf, 4); - neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS; + if (ksmbd_iov_pin_rsp(work, (void *)neg_rsp, + sizeof(struct smb_negotiate_rsp) - 4)) + return -ENOMEM; + neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS; neg_rsp->hdr.WordCount = 1; neg_rsp->DialectIndex = cpu_to_le16(work->conn->dialect); neg_rsp->ByteCount = 0; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index e5e438bf5499..6c0305be895e 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1420,7 +1420,6 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, out: posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); - mark_inode_dirty(inode); return rc; } diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c06efc020bd9..3b269e1f523a 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1241,14 +1241,12 @@ static int smb_direct_writev(struct ksmbd_transport *t, //FIXME: skip RFC1002 header.. buflen -= 4; - iov[0].iov_base += 4; - iov[0].iov_len -= 4; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 0; + start = i = 1; buflen = 0; while (true) { buflen += iov[i].iov_len; @@ -1366,24 +1364,35 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, LIST_HEAD(msg_list); char *desc_buf; int credits_needed; - unsigned int desc_buf_len; - size_t total_length = 0; + unsigned int desc_buf_len, desc_num = 0; if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; + if (buf_len > t->max_rdma_rw_size) + return -EINVAL; + /* calculate needed credits */ credits_needed = 0; desc_buf = buf; for (i = 0; i < desc_len / sizeof(*desc); i++) { + if (!buf_len) + break; + desc_buf_len = le32_to_cpu(desc[i].length); + if (!desc_buf_len) + return -EINVAL; + + if (desc_buf_len > buf_len) { + desc_buf_len = buf_len; + desc[i].length = cpu_to_le32(desc_buf_len); + buf_len = 0; + } credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); desc_buf += desc_buf_len; - total_length += desc_buf_len; - if (desc_buf_len == 0 || total_length > buf_len || - total_length > t->max_rdma_rw_size) - return -EINVAL; + buf_len -= desc_buf_len; + desc_num++; } ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", @@ -1395,7 +1404,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* build rdma_rw_ctx for each descriptor */ desc_buf = buf; - for (i = 0; i < desc_len / sizeof(*desc); i++) { + for (i = 0; i < desc_num; i++) { msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); if (!msg) { diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c index 9ae676906ed3..393dd4a7432b 100644 --- a/fs/smb/server/unicode.c +++ b/fs/smb/server/unicode.c @@ -11,7 +11,6 @@ #include <asm/unaligned.h> #include "glob.h" #include "unicode.h" -#include "uniupr.h" #include "smb_common.h" /* diff --git a/fs/smb/server/unicode.h b/fs/smb/server/unicode.h index 076f6034a789..28c7c736f7bb 100644 --- a/fs/smb/server/unicode.h +++ b/fs/smb/server/unicode.h @@ -18,49 +18,14 @@ * This is a compressed table of upper and lower case conversion. * */ -#ifndef _CIFS_UNICODE_H -#define _CIFS_UNICODE_H +#ifndef _SMB_UNICODE_H +#define _SMB_UNICODE_H #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> #include <linux/unicode.h> - -#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ - -/* - * Windows maps these to the user defined 16 bit Unicode range since they are - * reserved symbols (along with \ and /), otherwise illegal to store - * in filenames in NTFS - */ -#define UNI_ASTERISK ((__u16)('*' + 0xF000)) -#define UNI_QUESTION ((__u16)('?' + 0xF000)) -#define UNI_COLON ((__u16)(':' + 0xF000)) -#define UNI_GRTRTHAN ((__u16)('>' + 0xF000)) -#define UNI_LESSTHAN ((__u16)('<' + 0xF000)) -#define UNI_PIPE ((__u16)('|' + 0xF000)) -#define UNI_SLASH ((__u16)('\\' + 0xF000)) - -/* Just define what we want from uniupr.h. We don't want to define the tables - * in each source file. - */ -#ifndef UNICASERANGE_DEFINED -struct UniCaseRange { - wchar_t start; - wchar_t end; - signed char *table; -}; -#endif /* UNICASERANGE_DEFINED */ - -#ifndef UNIUPR_NOUPPER -extern signed char SmbUniUpperTable[512]; -extern const struct UniCaseRange SmbUniUpperRange[]; -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -extern signed char CifsUniLowerTable[512]; -extern const struct UniCaseRange CifsUniLowerRange[]; -#endif /* UNIUPR_NOLOWER */ +#include "../../nls/nls_ucs2_utils.h" #ifdef __KERNEL__ int smb_strtoUTF16(__le16 *to, const char *from, int len, @@ -73,286 +38,4 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); #endif -/* - * UniStrcat: Concatenate the second string to the first - * - * Returns: - * Address of the first string - */ -static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) -{ - wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ - - while (*ucs1++) - /*NULL*/; /* To end of first string */ - ucs1--; /* Return to the null */ - while ((*ucs1++ = *ucs2++)) - /*NULL*/; /* copy string 2 over */ - return anchor; -} - -/* - * UniStrchr: Find a character in a string - * - * Returns: - * Address of first occurrence of character in string - * or NULL if the character is not in the string - */ -static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc) -{ - while ((*ucs != uc) && *ucs) - ucs++; - - if (*ucs == uc) - return (wchar_t *)ucs; - return NULL; -} - -/* - * UniStrcmp: Compare two strings - * - * Returns: - * < 0: First string is less than second - * = 0: Strings are equal - * > 0: First string is greater than second - */ -static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) -{ - while ((*ucs1 == *ucs2) && *ucs1) { - ucs1++; - ucs2++; - } - return (int)*ucs1 - (int)*ucs2; -} - -/* - * UniStrcpy: Copy a string - */ -static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) -{ - wchar_t *anchor = ucs1; /* save the start of result string */ - - while ((*ucs1++ = *ucs2++)) - /*NULL*/; - return anchor; -} - -/* - * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) - */ -static inline size_t UniStrlen(const wchar_t *ucs1) -{ - int i = 0; - - while (*ucs1++) - i++; - return i; -} - -/* - * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a - * string (length limited) - */ -static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen) -{ - int i = 0; - - while (*ucs1++) { - i++; - if (i >= maxlen) - break; - } - return i; -} - -/* - * UniStrncat: Concatenate length limited string - */ -static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; /* save pointer to string 1 */ - - while (*ucs1++) - /*NULL*/; - ucs1--; /* point to null terminator of s1 */ - while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ - ucs1++; - ucs2++; - } - *ucs1 = 0; /* Null terminate the result */ - return anchor; -} - -/* - * UniStrncmp: Compare length limited string - */ -static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == *ucs2) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int)*ucs1 - (int)*ucs2; -} - -/* - * UniStrncmp_le: Compare length limited string - native to little-endian - */ -static inline int -UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int)*ucs1 - (int)__le16_to_cpu(*ucs2); -} - -/* - * UniStrncpy: Copy length limited string with pad - */ -static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = *ucs2++; - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrncpy_le: Copy length limited string with pad to little-endian - */ -static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = __le16_to_cpu(*ucs2++); - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrstr: Find a string in a string - * - * Returns: - * Address of first match found - * NULL if no matching string is found - */ -static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) -{ - const wchar_t *anchor1 = ucs1; - const wchar_t *anchor2 = ucs2; - - while (*ucs1) { - if (*ucs1 == *ucs2) { - /* Partial match found */ - ucs1++; - ucs2++; - } else { - if (!*ucs2) /* Match found */ - return (wchar_t *)anchor1; - ucs1 = ++anchor1; /* No match */ - ucs2 = anchor2; - } - } - - if (!*ucs2) /* Both end together */ - return (wchar_t *)anchor1; /* Match found */ - return NULL; /* No match */ -} - -#ifndef UNIUPR_NOUPPER -/* - * UniToupper: Convert a unicode character to upper case - */ -static inline wchar_t UniToupper(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(SmbUniUpperTable)) { - /* Latin characters */ - return uc + SmbUniUpperTable[uc]; /* Use base tables */ - } - - rp = SmbUniUpperRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - return uc; /* Past last range */ -} - -/* - * UniStrupr: Upper case a unicode string - */ -static inline __le16 *UniStrupr(register __le16 *upin) -{ - register __le16 *up; - - up = upin; - while (*up) { /* For all characters */ - *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); - up++; - } - return upin; /* Return input pointer */ -} -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -/* - * UniTolower: Convert a unicode character to lower case - */ -static inline wchar_t UniTolower(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(CifsUniLowerTable)) { - /* Latin characters */ - return uc + CifsUniLowerTable[uc]; /* Use base tables */ - } - - rp = CifsUniLowerRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - return uc; /* Past last range */ -} - -/* - * UniStrlwr: Lower case a unicode string - */ -static inline wchar_t *UniStrlwr(register wchar_t *upin) -{ - register wchar_t *up; - - up = upin; - while (*up) { /* For all characters */ - *up = UniTolower(*up); - up++; - } - return upin; /* Return input pointer */ -} - -#endif - -#endif /* _CIFS_UNICODE_H */ +#endif /* _SMB_UNICODE_H */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 3d5d652153a5..b5a5e50fc9ca 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -367,15 +367,15 @@ out: * @fid: file id of open file * @count: read byte count * @pos: file pos + * @rbuf: read data buffer * * Return: number of read bytes on success, otherwise error */ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, - loff_t *pos) + loff_t *pos, char *rbuf) { struct file *filp = fp->filp; ssize_t nbytes = 0; - char *rbuf = work->aux_payload_buf; struct inode *inode = file_inode(filp); if (S_ISDIR(inode->i_mode)) @@ -1659,7 +1659,8 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, u64 time; int rc; - generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat); + generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry), + ksmbd_kstat->kstat); time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); ksmbd_kstat->create_time = time; diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 72f9fb4b48d1..00968081856e 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -76,8 +76,8 @@ void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); -int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, - size_t count, loff_t *pos); +int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, + loff_t *pos, char *rbuf); int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, char *buf, size_t count, loff_t *pos, bool sync, ssize_t *written); diff --git a/fs/splice.c b/fs/splice.c index 3e2a31e1ce6a..d983d375ff11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, */ folio_wait_writeback(folio); - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* @@ -120,17 +119,17 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct page *page = buf->page; + struct folio *folio = page_folio(buf->page); int err; - if (!PageUptodate(page)) { - lock_page(page); + if (!folio_test_uptodate(folio)) { + folio_lock(folio); /* - * Page got truncated/unhashed. This will cause a 0-byte + * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ - if (!page->mapping) { + if (!folio->mapping) { err = -ENODATA; goto error; } @@ -138,20 +137,18 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, /* * Uh oh, read-error from disk. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } - /* - * Page is ok afterall, we are done. - */ - unlock_page(page); + /* Folio is ok after all, we are done */ + folio_unlock(folio); } return 0; error: - unlock_page(page); + folio_unlock(folio); return err; } @@ -1269,10 +1266,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - return splice_pipe_to_pipe(ipipe, opipe, len, flags); - } - - if (ipipe) { + ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); + } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1297,18 +1292,11 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); - if (ret > 0) - fsnotify_modify(out); - if (!off_out) out->f_pos = offset; else *off_out = offset; - - return ret; - } - - if (opipe) { + } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1324,18 +1312,25 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = splice_file_to_pipe(in, opipe, &offset, len, flags); - if (ret > 0) - fsnotify_access(in); - if (!off_in) in->f_pos = offset; else *off_in = offset; + } else { + ret = -EINVAL; + } - return ret; + if (ret > 0) { + /* + * Generate modify out before access in: + * do_splice_from() may've already sent modify out, + * and this ensures the events get merged. + */ + fsnotify_modify(out); + fsnotify_access(in); } - return -EINVAL; + return ret; } static long __do_splice(struct file *in, loff_t __user *off_in, @@ -1464,6 +1459,9 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, pipe_unlock(pipe); } + if (ret > 0) + fsnotify_access(file); + return ret; } @@ -1493,8 +1491,10 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); - if (ret > 0) + if (ret > 0) { wakeup_pipe_readers(pipe); + fsnotify_modify(file); + } return ret; } @@ -1928,6 +1928,11 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) } } + if (ret > 0) { + fsnotify_access(in); + fsnotify_modify(out); + } + return ret; } diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 24463145b351..c6e626b00546 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -61,7 +61,7 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); inode->i_atime.tv_sec = inode->i_mtime.tv_sec; - inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode_set_ctime(inode, inode->i_mtime.tv_sec, 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; diff --git a/fs/stack.c b/fs/stack.c index c9830924eb12..b5e01bdb5f5f 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -68,7 +68,7 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_rdev = src->i_rdev; dest->i_atime = src->i_atime; dest->i_mtime = src->i_mtime; - dest->i_ctime = src->i_ctime; + inode_set_ctime_to_ts(dest, inode_get_ctime(src)); dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; set_nlink(dest, src->i_nlink); diff --git a/fs/stat.c b/fs/stat.c index 7c238da22ef0..d43a5cc1bfa4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -28,9 +28,10 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @idmap: idmap of the mount the inode was found from - * @inode: Inode to use as the source - * @stat: Where to fill in the attributes + * @idmap: idmap of the mount the inode was found from + * @request_mask: statx request_mask + * @inode: Inode to use as the source + * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode @@ -42,8 +43,8 @@ * uid and gid filds. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs @nop_mnt_idmap. */ -void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, - struct kstat *stat) +void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, + struct inode *inode, struct kstat *stat) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); @@ -58,9 +59,15 @@ void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, stat->size = i_size_read(inode); stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + stat->ctime = inode_get_ctime(inode); stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; + + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + } EXPORT_SYMBOL(generic_fillattr); @@ -123,17 +130,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->result_mask |= STATX_CHANGE_COOKIE; - stat->change_cookie = inode_query_iversion(inode); - } - idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -272,6 +274,23 @@ int vfs_fstatat(int dfd, const char __user *filename, int statx_flags = flags | AT_NO_AUTOMOUNT; struct filename *name; + /* + * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) + * + * If AT_EMPTY_PATH is set, we expect the common case to be that + * empty path, and avoid doing all the extra pathname work. + */ + if (dfd >= 0 && flags == AT_EMPTY_PATH) { + char c; + + ret = get_user(c, filename); + if (unlikely(ret)) + return ret; + + if (likely(!c)) + return vfs_fstat(dfd, stat); + } + name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); @@ -363,12 +382,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #ifdef __ARCH_WANT_NEW_STAT -#if BITS_PER_LONG == 32 -# define choose_32_64(a,b) a -#else -# define choose_32_64(a,b) b -#endif - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif diff --git a/fs/super.c b/fs/super.c index e781226e2880..2d762ce67f6e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -39,7 +39,7 @@ #include <uapi/linux/mount.h> #include "internal.h" -static int thaw_super_locked(struct super_block *sb); +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -50,6 +50,130 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_internal", }; +static inline void __super_lock(struct super_block *sb, bool excl) +{ + if (excl) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); +} + +static inline void super_unlock(struct super_block *sb, bool excl) +{ + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); +} + +static inline void __super_lock_excl(struct super_block *sb) +{ + __super_lock(sb, true); +} + +static inline void super_unlock_excl(struct super_block *sb) +{ + super_unlock(sb, true); +} + +static inline void super_unlock_shared(struct super_block *sb) +{ + super_unlock(sb, false); +} + +static inline bool wait_born(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with smp_store_release() in super_wake() and ensures + * that we see SB_BORN or SB_DYING after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & (SB_BORN | SB_DYING); +} + +/** + * super_lock - wait for superblock to become ready and lock it + * @sb: superblock to wait for + * @excl: whether exclusive access is required + * + * If the superblock has neither passed through vfs_get_tree() or + * generic_shutdown_super() yet wait for it to happen. Either superblock + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're + * woken and we'll see SB_DYING. + * + * The caller must have acquired a temporary reference on @sb->s_count. + * + * Return: This returns true if SB_BORN was set, false if SB_DYING was + * set. The function acquires s_umount and returns with it held. + */ +static __must_check bool super_lock(struct super_block *sb, bool excl) +{ + + lockdep_assert_not_held(&sb->s_umount); + +relock: + __super_lock(sb, excl); + + /* + * Has gone through generic_shutdown_super() in the meantime. + * @sb->s_root is NULL and @sb->s_active is 0. No one needs to + * grab a reference to this. Tell them so. + */ + if (sb->s_flags & SB_DYING) + return false; + + /* Has called ->get_tree() successfully. */ + if (sb->s_flags & SB_BORN) + return true; + + super_unlock(sb, excl); + + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, wait_born(sb)); + + /* + * Neither SB_BORN nor SB_DYING are ever unset so we never loop. + * Just reacquire @sb->s_umount for the caller. + */ + goto relock; +} + +/* wait and acquire read-side of @sb->s_umount */ +static inline bool super_lock_shared(struct super_block *sb) +{ + return super_lock(sb, false); +} + +/* wait and acquire write-side of @sb->s_umount */ +static inline bool super_lock_excl(struct super_block *sb) +{ + return super_lock(sb, true); +} + +/* wake waiters */ +#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) +static void super_wake(struct super_block *sb, unsigned int flag) +{ + WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); + WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); + + /* + * Pairs with smp_load_acquire() in super_lock() to make sure + * all initializations in the superblock are seen by the user + * seeing SB_BORN sent. + */ + smp_store_release(&sb->s_flags, sb->s_flags | flag); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees SB_BORN set or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + wake_up_var(&sb->s_flags); +} + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. @@ -76,7 +200,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!trylock_super(sb)) + if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) @@ -110,7 +234,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, sc); } - up_read(&sb->s_umount); + super_unlock_shared(sb); return freed; } @@ -123,17 +247,17 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * We don't call trylock_super() here as it is a scalability bottleneck, - * so we're exposed to partial setup state. The shrinker rwsem does not - * protect filesystem operations backing list_lru_shrink_count() or - * s_op->nr_cached_objects(). Counts can change between - * super_cache_count and super_cache_scan, so we really don't need locks - * here. + * We don't call super_trylock_shared() here as it is a scalability + * bottleneck, so we're exposed to partial setup state. The shrinker + * rwsem does not protect filesystem operations backing + * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can + * change between super_cache_count and super_cache_scan, so we really + * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it - * is dangerous to access it. trylock_super() uses a SB_BORN check to - * avoid this situation, so do the same here. The memory barrier is + * is dangerous to access it. super_trylock_shared() uses a SB_BORN check + * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) @@ -176,7 +300,7 @@ static void destroy_unused_super(struct super_block *s) { if (!s) return; - up_write(&s->s_umount); + super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); @@ -310,6 +434,33 @@ void put_super(struct super_block *sb) spin_unlock(&sb_lock); } +static void kill_super_notify(struct super_block *sb) +{ + lockdep_assert_not_held(&sb->s_umount); + + /* already notified earlier */ + if (sb->s_flags & SB_DEAD) + return; + + /* + * Remove it from @fs_supers so it isn't found by new + * sget{_fc}() walkers anymore. Any concurrent mounter still + * managing to grab a temporary reference is guaranteed to + * already see SB_DYING and will wait until we notify them about + * SB_DEAD. + */ + spin_lock(&sb_lock); + hlist_del_init(&sb->s_instances); + spin_unlock(&sb_lock); + + /* + * Let concurrent mounts know that this thing is really dead. + * We don't need @sb->s_umount here as every concurrent caller + * will see SB_DYING and either discard the superblock or wait + * for SB_DEAD. + */ + super_wake(sb, SB_DEAD); +} /** * deactivate_locked_super - drop an active reference to superblock @@ -329,6 +480,8 @@ void deactivate_locked_super(struct super_block *s) unregister_shrinker(&s->s_shrink); fs->kill_sb(s); + kill_super_notify(s); + /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy @@ -340,7 +493,7 @@ void deactivate_locked_super(struct super_block *s) put_filesystem(fs); put_super(s); } else { - up_write(&s->s_umount); + super_unlock_excl(s); } } @@ -357,7 +510,7 @@ EXPORT_SYMBOL(deactivate_locked_super); void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { - down_write(&s->s_umount); + __super_lock_excl(s); deactivate_locked_super(s); } } @@ -379,20 +532,61 @@ EXPORT_SYMBOL(deactivate_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + bool born; + s->s_count++; spin_unlock(&sb_lock); - down_write(&s->s_umount); - if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { + born = super_lock_excl(s); + if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } - up_write(&s->s_umount); + super_unlock_excl(s); put_super(s); return 0; } +static inline bool wait_dead(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with memory barrier in super_wake() and ensures + * that we see SB_DEAD after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & SB_DEAD; +} + +/** + * grab_super_dead - acquire an active reference to a superblock + * @sb: superblock to acquire + * + * Acquire a temporary reference on a superblock and try to trade it for + * an active reference. This is used in sget{_fc}() to wait for a + * superblock to either become SB_BORN or for it to pass through + * sb->kill() and be marked as SB_DEAD. + * + * Return: This returns true if an active reference could be acquired, + * false if not. + */ +static bool grab_super_dead(struct super_block *sb) +{ + + sb->s_count++; + if (grab_super(sb)) { + put_super(sb); + lockdep_assert_held(&sb->s_umount); + return true; + } + wait_var_event(&sb->s_flags, wait_dead(sb)); + lockdep_assert_not_held(&sb->s_umount); + put_super(sb); + return false; +} + /* - * trylock_super - try to grab ->s_umount shared + * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we @@ -408,13 +602,13 @@ static int grab_super(struct super_block *s) __releases(sb_lock) * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ -bool trylock_super(struct super_block *sb) +bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { - if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & SB_BORN)) + if (!(sb->s_flags & SB_DYING) && sb->s_root && + (sb->s_flags & SB_BORN)) return true; - up_read(&sb->s_umount); + super_unlock_shared(sb); } return false; @@ -439,13 +633,13 @@ bool trylock_super(struct super_block *sb) void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); - down_write(&sb->s_umount); + __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; - up_write(&sb->s_umount); + super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); @@ -517,11 +711,17 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb->s_inode_list_lock); } } - spin_lock(&sb_lock); - /* should be initialized for __put_super_and_need_restart() */ - hlist_del_init(&sb->s_instances); - spin_unlock(&sb_lock); - up_write(&sb->s_umount); + /* + * Broadcast to everyone that grabbed a temporary reference to this + * superblock before we removed it from @fs_supers that the superblock + * is dying. Every walker of @fs_supers outside of sget{_fc}() will now + * discard this superblock and treat it as dead. + * + * We leave the superblock on @fs_supers so it can be found by + * sget{_fc}() until we passed sb->kill_sb(). + */ + super_wake(sb, SB_DYING); + super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); @@ -546,17 +746,31 @@ bool mount_capable(struct fs_context *fc) * @test: Comparison callback * @set: Setup callback * - * Find or create a superblock using the parameters stored in the filesystem - * context and the two callback functions. + * Create a new superblock or find an existing one. + * + * The @test callback is used to find a matching existing superblock. + * Whether or not the requested parameters in @fc are taken into account + * is specific to the @test callback that is used. They may even be + * completely ignored. + * + * If an extant superblock is matched, it will be returned unless: + * + * (1) the namespace the filesystem context @fc and the extant + * superblock's namespace differ + * + * (2) the filesystem context @fc has requested that reusing an extant + * superblock is not allowed * - * If an extant superblock is matched, then that will be returned with an - * elevated reference count that the caller must transfer or discard. + * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic - * initialisation will be performed (s_type, s_fs_info and s_id will be set and - * the set() callback will be invoked), the superblock will be published and it - * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE - * as yet unset. + * initialisation will be performed (s_type, s_fs_info and s_id will be + * set and the @set callback will be invoked), the superblock will be + * published and it will be returned in a partially constructed state + * with SB_BORN and SB_ACTIVE as yet unset. + * + * Return: On success, an extant or newly created superblock is + * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), @@ -595,6 +809,11 @@ retry: s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + /* + * Make the superblock visible on @super_blocks and @fs_supers. + * It's in a nascent state and users should wait on SB_BORN or + * SB_DYING to be set. + */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); @@ -603,12 +822,16 @@ retry: return s; share_extant_sb: - if (user_ns != old->s_user_ns) { + if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); + if (fc->exclusive) + warnfc(fc, "reusing existing filesystem not allowed"); + else + warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -652,7 +875,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -685,7 +908,7 @@ EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { - up_read(&sb->s_umount); + super_unlock_shared(sb); put_super(sb); } @@ -693,7 +916,7 @@ EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { - up_write(&sb->s_umount); + super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); @@ -704,7 +927,8 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) + /* Pairs with memory marrier in super_wake(). */ + if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); @@ -734,15 +958,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; + bool born; + sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); - up_read(&sb->s_umount); + super_unlock_shared(sb); spin_lock(&sb_lock); if (p) @@ -770,13 +994,15 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); - up_read(&sb->s_umount); + super_unlock_shared(sb); spin_lock(&sb_lock); if (p) @@ -791,43 +1017,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); /** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); -rescan: - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_bdev == bdev) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) - return sb; - up_read(&sb->s_umount); - /* nope, got unmounted */ - spin_lock(&sb_lock); - __put_super(sb); - goto rescan; - } - } - spin_unlock(&sb_lock); - return NULL; -} - -/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * @@ -842,15 +1031,12 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; -restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_bdev == bdev) { if (!grab_super(sb)) - goto restart; - up_write(&sb->s_umount); + return NULL; + super_unlock_excl(sb); return sb; } } @@ -863,28 +1049,21 @@ struct super_block *user_get_super(dev_t dev, bool excl) struct super_block *sb; spin_lock(&sb_lock); -rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_dev == dev) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - if (excl) - down_write(&sb->s_umount); - else - down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock(sb, excl); + if (born && sb->s_root) return sb; - if (excl) - up_write(&sb->s_umount); - else - up_read(&sb->s_umount); + super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); - goto rescan; + break; } } spin_unlock(&sb_lock); @@ -926,9 +1105,9 @@ int reconfigure_super(struct fs_context *fc) if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { - up_write(&sb->s_umount); + super_unlock_excl(sb); group_pin_kill(&sb->s_pins); - down_write(&sb->s_umount); + __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) @@ -991,9 +1170,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { + bool born = super_lock_excl(sb); + + if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1004,7 +1183,7 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - up_write(&sb->s_umount); + super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) @@ -1027,12 +1206,15 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - down_write(&sb->s_umount); - if (sb->s_root && sb->s_flags & SB_BORN) { - emergency_thaw_bdev(sb); - thaw_super_locked(sb); + bool born = super_lock_excl(sb); + + if (born && sb->s_root) { + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); + thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { - up_write(&sb->s_umount); + super_unlock_excl(sb); } } @@ -1108,6 +1290,7 @@ void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); + kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); @@ -1136,7 +1319,7 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -static int vfs_get_super(struct fs_context *fc, bool reconf, +static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) @@ -1154,19 +1337,9 @@ static int vfs_get_super(struct fs_context *fc, bool reconf, goto error; sb->s_flags |= SB_ACTIVE; - fc->root = dget(sb->s_root); - } else { - fc->root = dget(sb->s_root); - if (reconf) { - err = reconfigure_super(fc); - if (err < 0) { - dput(fc->root); - fc->root = NULL; - goto error; - } - } } + fc->root = dget(sb->s_root); return 0; error: @@ -1178,7 +1351,7 @@ int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, false, NULL, fill_super); + return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); @@ -1186,66 +1359,175 @@ int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, false, test_single_super, fill_super); + return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); -int get_tree_single_reconf(struct fs_context *fc, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) -{ - return vfs_get_super(fc, true, test_single_super, fill_super); -} -EXPORT_SYMBOL(get_tree_single_reconf); - int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; - return vfs_get_super(fc, false, test_keyed_super, fill_super); + return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_dev = *(dev_t *)data; + return 0; +} + +static int super_s_dev_set(struct super_block *s, struct fs_context *fc) +{ + return set_bdev_super(s, fc->sget_key); +} + +static int super_s_dev_test(struct super_block *s, struct fs_context *fc) +{ + return !(s->s_iflags & SB_I_RETIRED) && + s->s_dev == *(dev_t *)fc->sget_key; +} + +/** + * sget_dev - Find or create a superblock by device number + * @fc: Filesystem context. + * @dev: device number + * + * Find or create a superblock using the provided device number that + * will be stored in fc->sget_key. + * + * If an extant superblock is matched, then that will be returned with + * an elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will + * be set). The superblock will be published and it will be returned in + * a partially constructed state with SB_BORN and SB_ACTIVE as yet + * unset. + * + * Return: an existing or newly created superblock on success, an error + * pointer on failure. + */ +struct super_block *sget_dev(struct fs_context *fc, dev_t dev) +{ + fc->sget_key = &dev; + return sget_fc(fc, super_s_dev_test, super_s_dev_set); +} +EXPORT_SYMBOL(sget_dev); + #ifdef CONFIG_BLOCK -static void fs_mark_dead(struct block_device *bdev) +/* + * Lock a super block that the callers holds a reference to. + * + * The caller needs to ensure that the super_block isn't being freed while + * calling this function, e.g. by holding a lock over the call to this function + * and the place that clears the pointer to the superblock used by this function + * before freeing the superblock. + */ +static bool super_lock_shared_active(struct super_block *sb) { - struct super_block *sb; + bool born = super_lock_shared(sb); - sb = get_super(bdev); - if (!sb) + if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { + super_unlock_shared(sb); + return false; + } + return true; +} + +static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct super_block *sb = bdev->bd_holder; + + /* bd_holder_lock ensures that the sb isn't freed */ + lockdep_assert_held(&bdev->bd_holder_lock); + + if (!super_lock_shared_active(sb)) return; + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); - drop_super(sb); -} -static const struct blk_holder_ops fs_holder_ops = { - .mark_dead = fs_mark_dead, -}; + super_unlock_shared(sb); +} -static int set_bdev_super(struct super_block *s, void *data) +static void fs_bdev_sync(struct block_device *bdev) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); + struct super_block *sb = bdev->bd_holder; - if (bdev_stable_writes(s->s_bdev)) - s->s_iflags |= SB_I_STABLE_WRITES; - return 0; -} + lockdep_assert_held(&bdev->bd_holder_lock); -static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) -{ - return set_bdev_super(s, fc->sget_key); + if (!super_lock_shared_active(sb)) + return; + sync_filesystem(sb); + super_unlock_shared(sb); } -static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) +const struct blk_holder_ops fs_holder_ops = { + .mark_dead = fs_bdev_mark_dead, + .sync = fs_bdev_sync, +}; +EXPORT_SYMBOL_GPL(fs_holder_ops); + +int setup_bdev_super(struct super_block *sb, int sb_flags, + struct fs_context *fc) { - return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; + blk_mode_t mode = sb_open_mode(sb_flags); + struct block_device *bdev; + + bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev)) { + if (fc) + errorf(fc, "%s: Can't open blockdev", fc->source); + return PTR_ERR(bdev); + } + + /* + * This really should be in blkdev_get_by_dev, but right now can't due + * to legacy issues that require us to allow opening a block device node + * writable from userspace even for a read-only block device. + */ + if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, sb); + return -EACCES; + } + + /* + * Until SB_BORN flag is set, there can be no active superblock + * references and thus no filesystem freezing. get_active_super() will + * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. + * + * It is enough to check bdev was not frozen before we set s_bdev. + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (fc) + warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + blkdev_put(bdev, sb); + return -EBUSY; + } + spin_lock(&sb_lock); + sb->s_bdev = bdev; + sb->s_bdi = bdi_get(bdev->bd_disk->bdi); + if (bdev_stable_writes(bdev)) + sb->s_iflags |= SB_I_STABLE_WRITES; + spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, + sb->s_id); + sb_set_blocksize(sb, block_size(bdev)); + return 0; } +EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device @@ -1256,73 +1538,48 @@ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { - struct block_device *bdev; struct super_block *s; int error = 0; + dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags), - fc->fs_type, &fs_holder_ops); - if (IS_ERR(bdev)) { - errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev); - } - - /* Once the superblock is inserted into the list by sget_fc(), s_umount - * will protect the lockfs code from trying to start a snapshot while - * we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, fc->fs_type); - return -EBUSY; + error = lookup_bdev(fc->source, &dev); + if (error) { + errorf(fc, "%s: Can't lookup blockdev", fc->source); + return error; } fc->sb_flags |= SB_NOSEC; - fc->sget_key = bdev; - s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) { - blkdev_put(bdev, fc->fs_type); + s = sget_dev(fc, dev); + if (IS_ERR(s)) return PTR_ERR(s); - } if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { - warnf(fc, "%pg: Can't mount, would change RO state", bdev); + warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); - blkdev_put(bdev, fc->fs_type); return -EBUSY; } - + } else { /* - * s_umount nests inside open_mutex during - * __invalidate_device(). blkdev_put() acquires - * open_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * bdev_mark_dead()). It is safe because we have active sb + * reference and SB_BORN is not set yet. */ - up_write(&s->s_umount); - blkdev_put(bdev, fc->fs_type); - down_write(&s->s_umount); - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", - fc->fs_type->name, s->s_id); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, fc); + super_unlock_excl(s); + error = setup_bdev_super(s, fc->sb_flags, fc); + __super_lock_excl(s); + if (!error) + error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } BUG_ON(fc->root); @@ -1333,79 +1590,52 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { - struct block_device *bdev; struct super_block *s; - int error = 0; + int error; + dev_t dev; - bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, - &fs_holder_ops); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + error = lookup_bdev(dev_name, &dev); + if (error) + return ERR_PTR(error); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; - } - s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC, - bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); + flags |= SB_NOSEC; + s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) - goto error_s; + return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - error = -EBUSY; - goto error_bdev; + return ERR_PTR(-EBUSY); } - + } else { /* - * s_umount nests inside open_mutex during - * __invalidate_device(). blkdev_put() acquires - * open_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * bdev_mark_dead()). It is safe because we have active sb + * reference and SB_BORN is not set yet. */ - up_write(&s->s_umount); - blkdev_put(bdev, fs_type); - down_write(&s->s_umount); - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", - fs_type->name, s->s_id); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); + super_unlock_excl(s); + error = setup_bdev_super(s, flags, NULL); + __super_lock_excl(s); + if (!error) + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error; + return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } return dget(s->s_root); - -error_s: - error = PTR_ERR(s); -error_bdev: - blkdev_put(bdev, fs_type); -error: - return ERR_PTR(error); } EXPORT_SYMBOL(mount_bdev); @@ -1413,10 +1643,11 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - bdev->bd_super = NULL; generic_shutdown_super(sb); - sync_blockdev(bdev); - blkdev_put(bdev, sb->s_type); + if (bdev) { + sync_blockdev(bdev); + blkdev_put(bdev, sb); + } } EXPORT_SYMBOL(kill_block_super); @@ -1533,13 +1764,13 @@ int vfs_get_tree(struct fs_context *fc) WARN_ON(!sb->s_bdi); /* - * Write barrier is for super_cache_count(). We place it before setting - * SB_BORN as the data dependency between the two functions is the - * superblock structure contents that we just set up, not the SB_BORN - * flag. + * super_wake() contains a memory barrier which also care of + * ordering for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is + * the superblock structure contents that we just set up, not + * the SB_BORN flag. */ - smp_wmb(); - sb->s_flags |= SB_BORN; + super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { @@ -1644,14 +1875,43 @@ static void sb_freeze_unlock(struct super_block *sb, int level) percpu_up_write(sb->s_writers.rw_sem + level); } +static int wait_for_partially_frozen(struct super_block *sb) +{ + int ret = 0; + + do { + unsigned short old = sb->s_writers.frozen; + + up_write(&sb->s_umount); + ret = wait_var_event_killable(&sb->s_writers.frozen, + sb->s_writers.frozen != old); + down_write(&sb->s_umount); + } while (ret == 0 && + sb->s_writers.frozen != SB_UNFROZEN && + sb->s_writers.frozen != SB_FREEZE_COMPLETE); + + return ret; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock + * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's - * freeze_fs. Subsequent calls to this without first thawing the fs will return + * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * + * @who should be: + * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; + * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. + * + * The @who argument distinguishes between the kernel and userspace trying to + * freeze the filesystem. Although there cannot be multiple kernel freezes or + * multiple userspace freezes in effect at any given time, the kernel and + * userspace can both hold a filesystem frozen. The filesystem remains frozen + * until there are no kernel or userspace freezes in effect. + * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. @@ -1677,34 +1937,62 @@ static void sb_freeze_unlock(struct super_block *sb, int level) * * sb->s_writers.frozen is protected by sb->s_umount. */ -int freeze_super(struct super_block *sb) +int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; atomic_inc(&sb->s_active); - down_write(&sb->s_umount); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while freezing!"); + +retry: + if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { + if (sb->s_writers.freeze_holders & who) { + deactivate_locked_super(sb); + return -EBUSY; + } + + WARN_ON(sb->s_writers.freeze_holders == 0); + + /* + * Someone else already holds this type of freeze; share the + * freeze and assign the active ref to the freeze. + */ + sb->s_writers.freeze_holders |= who; + super_unlock_excl(sb); + return 0; + } + if (sb->s_writers.frozen != SB_UNFROZEN) { - deactivate_locked_super(sb); - return -EBUSY; + ret = wait_for_partially_frozen(sb); + if (ret) { + deactivate_locked_super(sb); + return ret; + } + + goto retry; } if (!(sb->s_flags & SB_BORN)) { - up_write(&sb->s_umount); + super_unlock_excl(sb); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ + sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; - up_write(&sb->s_umount); + wake_up_var(&sb->s_writers.frozen); + super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ - up_write(&sb->s_umount); + super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - down_write(&sb->s_umount); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while freezing!"); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -1715,6 +2003,7 @@ int freeze_super(struct super_block *sb) if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } @@ -1730,6 +2019,7 @@ int freeze_super(struct super_block *sb) "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); + wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } @@ -1738,24 +2028,50 @@ int freeze_super(struct super_block *sb) * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ + sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; + wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); - up_write(&sb->s_umount); + super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); -static int thaw_super_locked(struct super_block *sb) +/* + * Undoes the effect of a freeze_super_locked call. If the filesystem is + * frozen both by userspace and the kernel, a thaw call from either source + * removes that state without releasing the other state or unlocking the + * filesystem. + */ +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error; - if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { - up_write(&sb->s_umount); + if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { + if (!(sb->s_writers.freeze_holders & who)) { + super_unlock_excl(sb); + return -EINVAL; + } + + /* + * Freeze is shared with someone else. Release our hold and + * drop the active ref that freeze_super assigned to the + * freezer. + */ + if (sb->s_writers.freeze_holders & ~who) { + sb->s_writers.freeze_holders &= ~who; + deactivate_locked_super(sb); + return 0; + } + } else { + super_unlock_excl(sb); return -EINVAL; } if (sb_rdonly(sb)) { + sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; + wake_up_var(&sb->s_writers.frozen); goto out; } @@ -1764,15 +2080,16 @@ static int thaw_super_locked(struct super_block *sb) if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); + printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); - up_write(&sb->s_umount); + super_unlock_excl(sb); return error; } } + sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; + wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out: deactivate_locked_super(sb); @@ -1782,13 +2099,20 @@ out: /** * thaw_super -- unlock filesystem * @sb: the super to thaw + * @who: context that wants to freeze + * + * Unlocks the filesystem and marks it writeable again after freeze_super() + * if there are no remaining freezes on the filesystem. * - * Unlocks the filesystem and marks it writeable again after freeze_super(). + * @who should be: + * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; + * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. */ -int thaw_super(struct super_block *sb) +int thaw_super(struct super_block *sb, enum freeze_holder who) { - down_write(&sb->s_umount); - return thaw_super_locked(sb); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while thawing!"); + return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index b4e23e03fbeb..67b3f90afbfd 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -2,6 +2,7 @@ config SYSV_FS tristate "System V/Xenix/V7/Coherent file system support" depends on BLOCK + select BUFFER_HEAD help SCO, Xenix and Coherent are commercial Unix systems for Intel machines, and Version 7 was used on the DEC PDP-11. Saying Y diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 0140010aa0c3..2f5ead88d00b 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -224,7 +224,7 @@ got_it: memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: @@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) } de->inode = 0; dir_commit_chunk(page, pos, SYSV_DIRSIZE); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); } @@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index e732879036ab..6719da5889d9 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) dirty_sb(sb); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9e8d4a6fb2f3..0aa3827d8178 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -202,8 +202,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); - inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_blocks = 0; @@ -256,7 +255,7 @@ static int __sysv_write_inode(struct inode *inode, int wait) raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec); si = SYSV_I(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 58d7f43a1371..edb94e55de8e 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -183,7 +183,7 @@ static inline int splice_branch(struct inode *inode, *where->p = where->key; write_unlock(&pointers_lock); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -423,7 +423,7 @@ do_indirects: } n++; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (IS_SYNC(inode)) sysv_sync_inode (inode); else @@ -449,7 +449,8 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index fcf163fea3ad..d6b73798071b 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -103,7 +103,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -161,7 +161,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) err = sysv_delete_entry(de, page); if (!err) { - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); } unmap_and_put_page(page, de); @@ -230,7 +230,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, unmap_and_put_page(new_page, new_de); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile index 7c35a282b484..73c56da8e284 100644 --- a/fs/tracefs/Makefile +++ b/fs/tracefs/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only tracefs-objs := inode.o +tracefs-objs += event_inode.o obj-$(CONFIG_TRACING) += tracefs.o diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c new file mode 100644 index 000000000000..5f1714089884 --- /dev/null +++ b/fs/tracefs/event_inode.c @@ -0,0 +1,905 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * event_inode.c - part of tracefs, a pseudo file system for activating tracing + * + * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org> + * Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com> + * + * eventfs is used to dynamically create inodes and dentries based on the + * meta data provided by the tracing system. + * + * eventfs stores the meta-data of files/dirs and holds off on creating + * inodes/dentries of the files. When accessed, the eventfs will create the + * inodes/dentries in a just-in-time (JIT) manner. The eventfs will clean up + * and delete the inodes/dentries when they are no longer referenced. + */ +#include <linux/fsnotify.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/tracefs.h> +#include <linux/kref.h> +#include <linux/delay.h> +#include "internal.h" + +struct eventfs_inode { + struct list_head e_top_files; +}; + +/* + * struct eventfs_file - hold the properties of the eventfs files and + * directories. + * @name: the name of the file or directory to create + * @d_parent: holds parent's dentry + * @dentry: once accessed holds dentry + * @list: file or directory to be added to parent directory + * @ei: list of files and directories within directory + * @fop: file_operations for file or directory + * @iop: inode_operations for file or directory + * @data: something that the caller will want to get to later on + * @mode: the permission that the file or directory should have + */ +struct eventfs_file { + const char *name; + struct dentry *d_parent; + struct dentry *dentry; + struct list_head list; + struct eventfs_inode *ei; + const struct file_operations *fop; + const struct inode_operations *iop; + /* + * Union - used for deletion + * @del_list: list of eventfs_file to delete + * @rcu: eventfs_file to delete in RCU + * @is_freed: node is freed if one of the above is set + */ + union { + struct list_head del_list; + struct rcu_head rcu; + unsigned long is_freed; + }; + void *data; + umode_t mode; +}; + +static DEFINE_MUTEX(eventfs_mutex); +DEFINE_STATIC_SRCU(eventfs_srcu); + +static struct dentry *eventfs_root_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags); +static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); +static int eventfs_release(struct inode *inode, struct file *file); + +static const struct inode_operations eventfs_root_dir_inode_operations = { + .lookup = eventfs_root_lookup, +}; + +static const struct file_operations eventfs_file_operations = { + .open = dcache_dir_open_wrapper, + .read = generic_read_dir, + .iterate_shared = dcache_readdir_wrapper, + .llseek = generic_file_llseek, + .release = eventfs_release, +}; + +/** + * create_file - create a file in the tracefs filesystem + * @name: the name of the file to create. + * @mode: the permission that the file should have. + * @parent: parent dentry for this file. + * @data: something that the caller will want to get to later on. + * @fop: struct file_operations that should be used for this file. + * + * This is the basic "create a file" function for tracefs. It allows for a + * wide range of flexibility in creating a file. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the tracefs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If tracefs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +static struct dentry *create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fop) +{ + struct tracefs_inode *ti; + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + + if (WARN_ON_ONCE(!S_ISREG(mode))) + return NULL; + + dentry = eventfs_start_creating(name, parent); + + if (IS_ERR(dentry)) + return dentry; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return eventfs_failed_creating(dentry); + + inode->i_mode = mode; + inode->i_fop = fop; + inode->i_private = data; + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE; + d_instantiate(dentry, inode); + fsnotify_create(dentry->d_parent->d_inode, dentry); + return eventfs_end_creating(dentry); +}; + +/** + * create_dir - create a dir in the tracefs filesystem + * @name: the name of the file to create. + * @parent: parent dentry for this file. + * @data: something that the caller will want to get to later on. + * + * This is the basic "create a dir" function for eventfs. It allows for a + * wide range of flexibility in creating a dir. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the tracefs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If tracefs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +static struct dentry *create_dir(const char *name, struct dentry *parent, void *data) +{ + struct tracefs_inode *ti; + struct dentry *dentry; + struct inode *inode; + + dentry = eventfs_start_creating(name, parent); + if (IS_ERR(dentry)) + return dentry; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return eventfs_failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_fop = &eventfs_file_operations; + inode->i_private = data; + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE; + + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return eventfs_end_creating(dentry); +} + +/** + * eventfs_set_ef_status_free - set the ef->status to free + * @ti: the tracefs_inode of the dentry + * @dentry: dentry who's status to be freed + * + * eventfs_set_ef_status_free will be called if no more + * references remain + */ +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) +{ + struct tracefs_inode *ti_parent; + struct eventfs_inode *ei; + struct eventfs_file *ef, *tmp; + + /* The top level events directory may be freed by this */ + if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { + LIST_HEAD(ef_del_list); + + mutex_lock(&eventfs_mutex); + + ei = ti->private; + + /* Record all the top level files */ + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + lockdep_is_held(&eventfs_mutex)) { + list_add_tail(&ef->del_list, &ef_del_list); + } + + /* Nothing should access this, but just in case! */ + ti->private = NULL; + + mutex_unlock(&eventfs_mutex); + + /* Now safely free the top level files and their children */ + list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { + list_del(&ef->del_list); + eventfs_remove(ef); + } + + kfree(ei); + return; + } + + mutex_lock(&eventfs_mutex); + + ti_parent = get_tracefs(dentry->d_parent->d_inode); + if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) + goto out; + + ef = dentry->d_fsdata; + if (!ef) + goto out; + + /* + * If ef was freed, then the LSB bit is set for d_fsdata. + * But this should not happen, as it should still have a + * ref count that prevents it. Warn in case it does. + */ + if (WARN_ON_ONCE((unsigned long)ef & 1)) + goto out; + + dentry->d_fsdata = NULL; + ef->dentry = NULL; +out: + mutex_unlock(&eventfs_mutex); +} + +/** + * eventfs_post_create_dir - post create dir routine + * @ef: eventfs_file of recently created dir + * + * Map the meta-data of files within an eventfs dir to their parent dentry + */ +static void eventfs_post_create_dir(struct eventfs_file *ef) +{ + struct eventfs_file *ef_child; + struct tracefs_inode *ti; + + /* srcu lock already held */ + /* fill parent-child relation */ + list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list, + srcu_read_lock_held(&eventfs_srcu)) { + ef_child->d_parent = ef->dentry; + } + + ti = get_tracefs(ef->dentry->d_inode); + ti->private = ef->ei; +} + +/** + * create_dentry - helper function to create dentry + * @ef: eventfs_file of file or directory to create + * @parent: parent dentry + * @lookup: true if called from lookup routine + * + * Used to create a dentry for file/dir, executes post dentry creation routine + */ +static struct dentry * +create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) +{ + bool invalidate = false; + struct dentry *dentry; + + mutex_lock(&eventfs_mutex); + if (ef->is_freed) { + mutex_unlock(&eventfs_mutex); + return NULL; + } + if (ef->dentry) { + dentry = ef->dentry; + /* On dir open, up the ref count */ + if (!lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + mutex_unlock(&eventfs_mutex); + + if (!lookup) + inode_lock(parent->d_inode); + + if (ef->ei) + dentry = create_dir(ef->name, parent, ef->data); + else + dentry = create_file(ef->name, ef->mode, parent, + ef->data, ef->fop); + + if (!lookup) + inode_unlock(parent->d_inode); + + mutex_lock(&eventfs_mutex); + if (IS_ERR_OR_NULL(dentry)) { + /* If the ef was already updated get it */ + dentry = ef->dentry; + if (dentry && !lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + + if (!ef->dentry && !ef->is_freed) { + ef->dentry = dentry; + if (ef->ei) + eventfs_post_create_dir(ef); + dentry->d_fsdata = ef; + } else { + /* A race here, should try again (unless freed) */ + invalidate = true; + + /* + * Should never happen unless we get here due to being freed. + * Otherwise it means two dentries exist with the same name. + */ + WARN_ON_ONCE(!ef->is_freed); + } + mutex_unlock(&eventfs_mutex); + if (invalidate) + d_invalidate(dentry); + + if (lookup || invalidate) + dput(dentry); + + return invalidate ? NULL : dentry; +} + +static bool match_event_file(struct eventfs_file *ef, const char *name) +{ + bool ret; + + mutex_lock(&eventfs_mutex); + ret = !ef->is_freed && strcmp(ef->name, name) == 0; + mutex_unlock(&eventfs_mutex); + + return ret; +} + +/** + * eventfs_root_lookup - lookup routine to create file/dir + * @dir: in which a lookup is being done + * @dentry: file/dir dentry + * @flags: to pass as flags parameter to simple lookup + * + * Used to create a dynamic file/dir within @dir. Use the eventfs_inode + * list of meta data to find the information needed to create the file/dir. + */ +static struct dentry *eventfs_root_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct tracefs_inode *ti; + struct eventfs_inode *ei; + struct eventfs_file *ef; + struct dentry *ret = NULL; + int idx; + + ti = get_tracefs(dir); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return NULL; + + ei = ti->private; + idx = srcu_read_lock(&eventfs_srcu); + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + srcu_read_lock_held(&eventfs_srcu)) { + if (!match_event_file(ef, dentry->d_name.name)) + continue; + ret = simple_lookup(dir, dentry, flags); + create_dentry(ef, ef->d_parent, true); + break; + } + srcu_read_unlock(&eventfs_srcu, idx); + return ret; +} + +struct dentry_list { + void *cursor; + struct dentry **dentries; +}; + +/** + * eventfs_release - called to release eventfs file/dir + * @inode: inode to be released + * @file: file to be released (not used) + */ +static int eventfs_release(struct inode *inode, struct file *file) +{ + struct tracefs_inode *ti; + struct dentry_list *dlist = file->private_data; + void *cursor; + int i; + + ti = get_tracefs(inode); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return -EINVAL; + + if (WARN_ON_ONCE(!dlist)) + return -EINVAL; + + for (i = 0; dlist->dentries[i]; i++) { + dput(dlist->dentries[i]); + } + + cursor = dlist->cursor; + kfree(dlist->dentries); + kfree(dlist); + file->private_data = cursor; + return dcache_dir_close(inode, file); +} + +/** + * dcache_dir_open_wrapper - eventfs open wrapper + * @inode: not used + * @file: dir to be opened (to create its child) + * + * Used to dynamically create the file/dir within @file. @file is really a + * directory and all the files/dirs of the children within @file will be + * created. If any of the files/dirs have already been created, their + * reference count will be incremented. + */ +static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) +{ + struct tracefs_inode *ti; + struct eventfs_inode *ei; + struct eventfs_file *ef; + struct dentry_list *dlist; + struct dentry **dentries = NULL; + struct dentry *dentry = file_dentry(file); + struct dentry *d; + struct inode *f_inode = file_inode(file); + int cnt = 0; + int idx; + int ret; + + ti = get_tracefs(f_inode); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return -EINVAL; + + if (WARN_ON_ONCE(file->private_data)) + return -EINVAL; + + dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); + if (!dlist) + return -ENOMEM; + + ei = ti->private; + idx = srcu_read_lock(&eventfs_srcu); + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + srcu_read_lock_held(&eventfs_srcu)) { + d = create_dentry(ef, dentry, false); + if (d) { + struct dentry **tmp; + + tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); + if (!tmp) + break; + tmp[cnt] = d; + tmp[cnt + 1] = NULL; + cnt++; + dentries = tmp; + } + } + srcu_read_unlock(&eventfs_srcu, idx); + ret = dcache_dir_open(inode, file); + + /* + * dcache_dir_open() sets file->private_data to a dentry cursor. + * Need to save that but also save all the dentries that were + * opened by this function. + */ + dlist->cursor = file->private_data; + dlist->dentries = dentries; + file->private_data = dlist; + return ret; +} + +/* + * This just sets the file->private_data back to the cursor and back. + */ +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) +{ + struct dentry_list *dlist = file->private_data; + int ret; + + file->private_data = dlist->cursor; + ret = dcache_readdir(file, ctx); + dlist->cursor = file->private_data; + file->private_data = dlist; + return ret; +} + +/** + * eventfs_prepare_ef - helper function to prepare eventfs_file + * @name: the name of the file/directory to create. + * @mode: the permission that the file should have. + * @fop: struct file_operations that should be used for this file/directory. + * @iop: struct inode_operations that should be used for this file/directory. + * @data: something that the caller will want to get to later on. The + * inode.i_private pointer will point to this value on the open() call. + * + * This function allocates and fills the eventfs_file structure. + */ +static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode, + const struct file_operations *fop, + const struct inode_operations *iop, + void *data) +{ + struct eventfs_file *ef; + + ef = kzalloc(sizeof(*ef), GFP_KERNEL); + if (!ef) + return ERR_PTR(-ENOMEM); + + ef->name = kstrdup(name, GFP_KERNEL); + if (!ef->name) { + kfree(ef); + return ERR_PTR(-ENOMEM); + } + + if (S_ISDIR(mode)) { + ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL); + if (!ef->ei) { + kfree(ef->name); + kfree(ef); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&ef->ei->e_top_files); + } else { + ef->ei = NULL; + } + + ef->iop = iop; + ef->fop = fop; + ef->mode = mode; + ef->data = data; + return ef; +} + +/** + * eventfs_create_events_dir - create the trace event structure + * @name: the name of the directory to create. + * @parent: parent dentry for this file. This should be a directory dentry + * if set. If this parameter is NULL, then the directory will be + * created in the root of the tracefs filesystem. + * + * This function creates the top of the trace event directory. + */ +struct dentry *eventfs_create_events_dir(const char *name, + struct dentry *parent) +{ + struct dentry *dentry = tracefs_start_creating(name, parent); + struct eventfs_inode *ei; + struct tracefs_inode *ti; + struct inode *inode; + + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + + if (IS_ERR(dentry)) + return dentry; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return ERR_PTR(-ENOMEM); + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) { + kfree(ei); + tracefs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&ei->e_top_files); + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; + ti->private = ei; + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_fop = &eventfs_file_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return tracefs_end_creating(dentry); +} + +/** + * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later + * @name: the name of the file to create. + * @parent: parent dentry for this dir. + * + * This function adds eventfs subsystem dir to list. + * And all these dirs are created on the fly when they are looked up, + * and the dentry and inodes will be removed when they are done. + */ +struct eventfs_file *eventfs_add_subsystem_dir(const char *name, + struct dentry *parent) +{ + struct tracefs_inode *ti_parent; + struct eventfs_inode *ei_parent; + struct eventfs_file *ef; + + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + + if (!parent) + return ERR_PTR(-EINVAL); + + ti_parent = get_tracefs(parent->d_inode); + ei_parent = ti_parent->private; + + ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL); + if (IS_ERR(ef)) + return ef; + + mutex_lock(&eventfs_mutex); + list_add_tail(&ef->list, &ei_parent->e_top_files); + ef->d_parent = parent; + mutex_unlock(&eventfs_mutex); + return ef; +} + +/** + * eventfs_add_dir - add eventfs dir to list to create later + * @name: the name of the file to create. + * @ef_parent: parent eventfs_file for this dir. + * + * This function adds eventfs dir to list. + * And all these dirs are created on the fly when they are looked up, + * and the dentry and inodes will be removed when they are done. + */ +struct eventfs_file *eventfs_add_dir(const char *name, + struct eventfs_file *ef_parent) +{ + struct eventfs_file *ef; + + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + + if (!ef_parent) + return ERR_PTR(-EINVAL); + + ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL); + if (IS_ERR(ef)) + return ef; + + mutex_lock(&eventfs_mutex); + list_add_tail(&ef->list, &ef_parent->ei->e_top_files); + ef->d_parent = ef_parent->dentry; + mutex_unlock(&eventfs_mutex); + return ef; +} + +/** + * eventfs_add_events_file - add the data needed to create a file for later reference + * @name: the name of the file to create. + * @mode: the permission that the file should have. + * @parent: parent dentry for this file. + * @data: something that the caller will want to get to later on. + * @fop: struct file_operations that should be used for this file. + * + * This function is used to add the information needed to create a + * dentry/inode within the top level events directory. The file created + * will have the @mode permissions. The @data will be used to fill the + * inode.i_private when the open() call is done. The dentry and inodes are + * all created when they are referenced, and removed when they are no + * longer referenced. + */ +int eventfs_add_events_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fop) +{ + struct tracefs_inode *ti; + struct eventfs_inode *ei; + struct eventfs_file *ef; + + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + + if (!parent) + return -EINVAL; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + + if (!parent->d_inode) + return -EINVAL; + + ti = get_tracefs(parent->d_inode); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return -EINVAL; + + ei = ti->private; + ef = eventfs_prepare_ef(name, mode, fop, NULL, data); + + if (IS_ERR(ef)) + return -ENOMEM; + + mutex_lock(&eventfs_mutex); + list_add_tail(&ef->list, &ei->e_top_files); + ef->d_parent = parent; + mutex_unlock(&eventfs_mutex); + return 0; +} + +/** + * eventfs_add_file - add eventfs file to list to create later + * @name: the name of the file to create. + * @mode: the permission that the file should have. + * @ef_parent: parent eventfs_file for this file. + * @data: something that the caller will want to get to later on. + * @fop: struct file_operations that should be used for this file. + * + * This function is used to add the information needed to create a + * file within a subdirectory of the events directory. The file created + * will have the @mode permissions. The @data will be used to fill the + * inode.i_private when the open() call is done. The dentry and inodes are + * all created when they are referenced, and removed when they are no + * longer referenced. + */ +int eventfs_add_file(const char *name, umode_t mode, + struct eventfs_file *ef_parent, + void *data, + const struct file_operations *fop) +{ + struct eventfs_file *ef; + + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + + if (!ef_parent) + return -EINVAL; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + + ef = eventfs_prepare_ef(name, mode, fop, NULL, data); + if (IS_ERR(ef)) + return -ENOMEM; + + mutex_lock(&eventfs_mutex); + list_add_tail(&ef->list, &ef_parent->ei->e_top_files); + ef->d_parent = ef_parent->dentry; + mutex_unlock(&eventfs_mutex); + return 0; +} + +static void free_ef(struct rcu_head *head) +{ + struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu); + + kfree(ef->name); + kfree(ef->ei); + kfree(ef); +} + +/** + * eventfs_remove_rec - remove eventfs dir or file from list + * @ef: eventfs_file to be removed. + * @head: to create list of eventfs_file to be deleted + * @level: to check recursion depth + * + * The helper function eventfs_remove_rec() is used to clean up and free the + * associated data from eventfs for both of the added functions. + */ +static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level) +{ + struct eventfs_file *ef_child; + + if (!ef) + return; + /* + * Check recursion depth. It should never be greater than 3: + * 0 - events/ + * 1 - events/group/ + * 2 - events/group/event/ + * 3 - events/group/event/file + */ + if (WARN_ON_ONCE(level > 3)) + return; + + if (ef->ei) { + /* search for nested folders or files */ + list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list, + lockdep_is_held(&eventfs_mutex)) { + eventfs_remove_rec(ef_child, head, level + 1); + } + } + + list_del_rcu(&ef->list); + list_add_tail(&ef->del_list, head); +} + +/** + * eventfs_remove - remove eventfs dir or file from list + * @ef: eventfs_file to be removed. + * + * This function acquire the eventfs_mutex lock and call eventfs_remove_rec() + */ +void eventfs_remove(struct eventfs_file *ef) +{ + struct eventfs_file *tmp; + LIST_HEAD(ef_del_list); + struct dentry *dentry_list = NULL; + struct dentry *dentry; + + if (!ef) + return; + + mutex_lock(&eventfs_mutex); + eventfs_remove_rec(ef, &ef_del_list, 0); + list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { + if (ef->dentry) { + unsigned long ptr = (unsigned long)dentry_list; + + /* Keep the dentry from being freed yet */ + dget(ef->dentry); + + /* + * Paranoid: The dget() above should prevent the dentry + * from being freed and calling eventfs_set_ef_status_free(). + * But just in case, set the link list LSB pointer to 1 + * and have eventfs_set_ef_status_free() check that to + * make sure that if it does happen, it will not think + * the d_fsdata is an event_file. + * + * For this to work, no event_file should be allocated + * on a odd space, as the ef should always be allocated + * to be at least word aligned. Check for that too. + */ + WARN_ON_ONCE(ptr & 1); + + ef->dentry->d_fsdata = (void *)(ptr | 1); + dentry_list = ef->dentry; + ef->dentry = NULL; + } + call_srcu(&eventfs_srcu, &ef->rcu, free_ef); + } + mutex_unlock(&eventfs_mutex); + + while (dentry_list) { + unsigned long ptr; + + dentry = dentry_list; + ptr = (unsigned long)dentry->d_fsdata & ~1UL; + dentry_list = (struct dentry *)ptr; + dentry->d_fsdata = NULL; + d_invalidate(dentry); + mutex_lock(&eventfs_mutex); + /* dentry should now have at least a single reference */ + WARN_ONCE((int)d_count(dentry) < 1, + "dentry %p less than one reference (%d) after invalidate\n", + dentry, d_count(dentry)); + mutex_unlock(&eventfs_mutex); + dput(dentry); + } +} + +/** + * eventfs_remove_events_dir - remove eventfs dir or file from list + * @dentry: events's dentry to be removed. + * + * This function remove events main directory + */ +void eventfs_remove_events_dir(struct dentry *dentry) +{ + struct tracefs_inode *ti; + + if (!dentry || !dentry->d_inode) + return; + + ti = get_tracefs(dentry->d_inode); + if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) + return; + + d_invalidate(dentry); + dput(dentry); +} diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 57ac8aa4a724..891653ba9cf3 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -21,13 +21,33 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include "internal.h" #define TRACEFS_DEFAULT_MODE 0700 +static struct kmem_cache *tracefs_inode_cachep __ro_after_init; static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +static struct inode *tracefs_alloc_inode(struct super_block *sb) +{ + struct tracefs_inode *ti; + + ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); + if (!ti) + return NULL; + + ti->flags = 0; + + return &ti->vfs_inode; +} + +static void tracefs_free_inode(struct inode *inode) +{ + kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode)); +} + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -127,12 +147,12 @@ static const struct inode_operations tracefs_dir_inode_operations = { .rmdir = tracefs_syscall_rmdir, }; -static struct inode *tracefs_get_inode(struct super_block *sb) +struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } @@ -290,6 +310,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; + umode_t tmp_mode; /* * On remount, only reset mode/uid/gid if they were provided as mount @@ -297,8 +318,9 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) */ if (!remount || opts->opts & BIT(Opt_mode)) { - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO; + tmp_mode |= opts->mode; + WRITE_ONCE(inode->i_mode, tmp_mode); } if (!remount || opts->opts & BIT(Opt_uid)) @@ -346,11 +368,31 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) } static const struct super_operations tracefs_super_operations = { + .alloc_inode = tracefs_alloc_inode, + .free_inode = tracefs_free_inode, + .drop_inode = generic_delete_inode, .statfs = simple_statfs, .remount_fs = tracefs_remount, .show_options = tracefs_show_options, }; +static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct tracefs_inode *ti; + + if (!dentry || !inode) + return; + + ti = get_tracefs(inode); + if (ti && ti->flags & TRACEFS_EVENT_INODE) + eventfs_set_ef_status_free(ti, dentry); + iput(inode); +} + +static const struct dentry_operations tracefs_dentry_operations = { + .d_iput = tracefs_dentry_iput, +}; + static int trace_fill_super(struct super_block *sb, void *data, int silent) { static const struct tree_descr trace_files[] = {{""}}; @@ -373,6 +415,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent) goto fail; sb->s_op = &tracefs_super_operations; + sb->s_d_op = &tracefs_dentry_operations; tracefs_apply_options(sb, false); @@ -399,7 +442,7 @@ static struct file_system_type trace_fs_type = { }; MODULE_ALIAS_FS("tracefs"); -static struct dentry *start_creating(const char *name, struct dentry *parent) +struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; @@ -437,7 +480,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) return dentry; } -static struct dentry *failed_creating(struct dentry *dentry) +struct dentry *tracefs_failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); @@ -445,13 +488,87 @@ static struct dentry *failed_creating(struct dentry *dentry) return NULL; } -static struct dentry *end_creating(struct dentry *dentry) +struct dentry *tracefs_end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } /** + * eventfs_start_creating - start the process of creating a dentry + * @name: Name of the file created for the dentry + * @parent: The parent dentry where this dentry will be created + * + * This is a simple helper function for the dynamically created eventfs + * files. When the directory of the eventfs files are accessed, their + * dentries are created on the fly. This function is used to start that + * process. + */ +struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + int error; + + error = simple_pin_fs(&trace_fs_type, &tracefs_mount, + &tracefs_mount_count); + if (error) + return ERR_PTR(error); + + /* + * If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. + */ + if (!parent) + parent = tracefs_mount->mnt_root; + + if (unlikely(IS_DEADDIR(parent->d_inode))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); + + if (!IS_ERR(dentry) && dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } + + if (IS_ERR(dentry)) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + + return dentry; +} + +/** + * eventfs_failed_creating - clean up a failed eventfs dentry creation + * @dentry: The dentry to clean up + * + * If after calling eventfs_start_creating(), a failure is detected, the + * resources created by eventfs_start_creating() needs to be cleaned up. In + * that case, this function should be called to perform that clean up. + */ +struct dentry *eventfs_failed_creating(struct dentry *dentry) +{ + dput(dentry); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + return NULL; +} + +/** + * eventfs_end_creating - Finish the process of creating a eventfs dentry + * @dentry: The dentry that has successfully been created. + * + * This function is currently just a place holder to match + * eventfs_start_creating(). In case any synchronization needs to be added, + * this function will be used to implement that without having to modify + * the callers of eventfs_start_creating(). + */ +struct dentry *eventfs_end_creating(struct dentry *dentry) +{ + return dentry; +} + +/** * tracefs_create_file - create a file in the tracefs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. @@ -490,14 +607,14 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); + dentry = tracefs_start_creating(name, parent); if (IS_ERR(dentry)) return NULL; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return failed_creating(dentry); + return tracefs_failed_creating(dentry); inode->i_mode = mode; inode->i_fop = fops ? fops : &tracefs_file_operations; @@ -506,13 +623,13 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return tracefs_end_creating(dentry); } static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -520,7 +637,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return failed_creating(dentry); + return tracefs_failed_creating(dentry); /* Do not set bits for OTH */ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; @@ -534,7 +651,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return tracefs_end_creating(dentry); } /** @@ -556,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + return __create_dir(name, parent, &simple_dir_inode_operations); } @@ -628,10 +748,26 @@ bool tracefs_initialized(void) return tracefs_registered; } +static void init_once(void *foo) +{ + struct tracefs_inode *ti = (struct tracefs_inode *) foo; + + inode_init_once(&ti->vfs_inode); +} + static int __init tracefs_init(void) { int retval; + tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache", + sizeof(struct tracefs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + init_once); + if (!tracefs_inode_cachep) + return -ENOMEM; + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); if (retval) return -EINVAL; diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h new file mode 100644 index 000000000000..4f2e49e2197b --- /dev/null +++ b/fs/tracefs/internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TRACEFS_INTERNAL_H +#define _TRACEFS_INTERNAL_H + +enum { + TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_TOP_INODE = BIT(2), +}; + +struct tracefs_inode { + unsigned long flags; + void *private; + struct inode vfs_inode; +}; + +static inline struct tracefs_inode *get_tracefs(const struct inode *inode) +{ + return container_of(inode, struct tracefs_inode, vfs_inode); +} + +struct dentry *tracefs_start_creating(const char *name, struct dentry *parent); +struct dentry *tracefs_end_creating(struct dentry *dentry); +struct dentry *tracefs_failed_creating(struct dentry *dentry); +struct inode *tracefs_get_inode(struct super_block *sb); +struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); +struct dentry *eventfs_failed_creating(struct dentry *dentry); +struct dentry *eventfs_end_creating(struct dentry *dentry); +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry); + +#endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 9c9d3f0e36a4..eef9e527d9ff 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -243,8 +243,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); pr_err("\tctime %u.%u\n", - (unsigned int)inode->i_ctime.tv_sec, - (unsigned int)inode->i_ctime.tv_nsec); + (unsigned int) inode_get_ctime(inode).tv_sec, + (unsigned int) inode_get_ctime(inode).tv_nsec); pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); pr_err("\txattr_size %u\n", ui->xattr_size); pr_err("\txattr_cnt %u\n", ui->xattr_cnt); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ef0499edc248..2f48c58d47cd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -96,8 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode->i_flags |= S_NOCMTIME; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mapping->nrpages = 0; if (!is_xattr) { @@ -325,7 +324,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -765,10 +764,10 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -838,11 +837,11 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = current_time(dir); + inode_set_ctime_current(inode); drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -940,12 +939,12 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = current_time(dir); + inode_set_ctime_current(inode); clear_nlink(inode); drop_nlink(dir); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -1019,7 +1018,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); @@ -1110,7 +1109,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1210,7 +1209,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1298,7 +1297,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct ubifs_budget_req wht_req; - struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; @@ -1414,8 +1412,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the @i_ctime for inodes on a * rename. */ - time = current_time(old_dir); - old_inode->i_ctime = time; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); /* We must adjust parent link count when renaming directories */ if (is_dir) { @@ -1444,13 +1441,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_size -= old_sz; ubifs_inode(old_dir)->ui_size = old_dir->i_size; - old_dir->i_mtime = old_dir->i_ctime = time; - new_dir->i_mtime = new_dir->i_ctime = time; /* * And finally, if we unlinked a direntry which happened to have the * same name as the moved direntry, we have to decrement @i_nlink of - * the unlinked inode and change its ctime. + * the unlinked inode. */ if (unlink) { /* @@ -1462,7 +1457,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, clear_nlink(new_inode); else drop_nlink(new_inode); - new_inode->i_ctime = time; } else { new_dir->i_size += new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1557,7 +1551,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); struct inode *fst_inode = d_inode(old_dentry); struct inode *snd_inode = d_inode(new_dentry); - struct timespec64 time; int err; struct fscrypt_name fst_nm, snd_nm; @@ -1588,11 +1581,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, lock_4_inodes(old_dir, new_dir, NULL, NULL); - time = current_time(old_dir); - fst_inode->i_ctime = time; - snd_inode->i_ctime = time; - old_dir->i_mtime = old_dir->i_ctime = time; - new_dir->i_mtime = new_dir->i_ctime = time; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dir != new_dir) { if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) { @@ -1665,7 +1654,7 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 6738fe43040b..e5382f0b2587 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1092,7 +1092,7 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) if (attr->ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1364,8 +1364,10 @@ out: static inline int mctime_update_needed(const struct inode *inode, const struct timespec64 *now) { + struct timespec64 ctime = inode_get_ctime(inode); + if (!timespec64_equal(&inode->i_mtime, now) || - !timespec64_equal(&inode->i_ctime, now)) + !timespec64_equal(&ctime, now)) return 1; return 0; } @@ -1376,8 +1378,7 @@ static inline int mctime_update_needed(const struct inode *inode, * * This function updates time of the inode. */ -int ubifs_update_time(struct inode *inode, struct timespec64 *time, - int flags) +int ubifs_update_time(struct inode *inode, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1385,21 +1386,17 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; int err, release; - if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) - return generic_update_time(inode, time, flags); + if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) { + generic_update_time(inode, flags); + return 0; + } err = ubifs_budget_space(c, &req); if (err) return err; mutex_lock(&ui->ui_mutex); - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - + inode_update_timestamps(inode, flags); release = ui->dirty; __mark_inode_dirty(inode, I_DIRTY_SYNC); mutex_unlock(&ui->ui_mutex); @@ -1432,7 +1429,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1570,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 67c5108abd89..d79cabe193c3 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -118,7 +118,7 @@ static int setflags(struct inode *inode, int flags) ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index dc52ac0f4a34..ffc9beee7be6 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -454,8 +454,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); - ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec); + ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ino->uid = cpu_to_le32(i_uid_read(inode)); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 32cb14759796..b08fb28d16b5 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -146,8 +146,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); - inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); - inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); inode->i_size = le64_to_cpu(ino->size); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4c36044140e7..ebb3ad6b5e7e 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2027,7 +2027,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); +int ubifs_update_time(struct inode *inode, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 349228dd1191..406c82eab513 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -134,7 +134,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -215,7 +215,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -474,7 +474,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 82e8bfa2dfd9..8f7ce30d47fd 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config UDF_FS tristate "UDF file system support" + select BUFFER_HEAD select CRC_ITU_T select NLS select LEGACY_DIRECT_IO diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1c775e072b2f..93153665eb37 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -95,7 +95,7 @@ static int udf_copy_fi(struct udf_fileident_iter *iter) } off = iter->pos & (blksize - 1); - len = min_t(int, sizeof(struct fileIdentDesc), blksize - off); + len = min_t(u32, sizeof(struct fileIdentDesc), blksize - off); memcpy(&iter->fi, iter->bh[0]->b_data + off, len); if (len < sizeof(struct fileIdentDesc)) memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data, diff --git a/fs/udf/file.c b/fs/udf/file.c index 243840dc83ad..0ceac4b5937c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) else end = PAGE_SIZE; err = __block_write_begin(page, 0, end, udf_get_block); - if (!err) - err = block_commit_write(page, 0, end); - if (err < 0) { + if (err) { unlock_page(page); - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } + + block_commit_write(page, 0, end); out_dirty: set_page_dirty(page); wait_for_stable_page(page); diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 5f7ac8c84798..6b558cbbeb6b 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -100,7 +100,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); iinfo->i_crtime = inode->i_mtime; if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 28cdfc57d946..a17a6184cc39 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -352,8 +352,6 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; @@ -910,7 +908,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map) map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; iinfo->i_next_alloc_block = map->lblk + 1; iinfo->i_next_alloc_goal = newblocknum + 1; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); @@ -1298,7 +1296,7 @@ set_size: goto out_unlock; } update_time: - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else @@ -1329,6 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; + struct timespec64 ctime; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { @@ -1507,7 +1506,8 @@ reread: udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime); udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime); - udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime); + udf_disk_stamp_to_time(&ctime, fe->attrTime); + inode_set_ctime_to_ts(inode, ctime); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1522,7 +1522,8 @@ reread: udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime); udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); - udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime); + udf_disk_stamp_to_time(&ctime, efe->attrTime); + inode_set_ctime_to_ts(inode, ctime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1799,7 +1800,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); - udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1830,12 +1831,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_adjust_time(iinfo, inode->i_atime); udf_adjust_time(iinfo, inode->i_mtime); - udf_adjust_time(iinfo, inode->i_ctime); + udf_adjust_time(iinfo, inode_get_ctime(inode)); udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); - udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index a95579b043ab..ae55ab8859b6 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_fiiter_write_fi(&iter, NULL); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, false, 1); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); @@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; inode_dec_link_count(dir); udf_add_fid_counter(dir->i_sb, true, -1); - inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); mark_inode_dirty(dir); ret = 0; end_rmdir: @@ -555,11 +555,11 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) set_nlink(inode, 1); } udf_fiiter_delete_entry(&iter); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); inode_dec_link_count(inode); udf_add_fid_counter(dir->i_sb, false, -1); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); ret = 0; end_unlink: udf_fiiter_release(&iter); @@ -746,9 +746,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); udf_add_fid_counter(dir->i_sb, false, 1); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); @@ -833,7 +833,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); /* @@ -861,13 +861,13 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (new_inode) { - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), -1); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); - new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); + new_dir->i_mtime = inode_set_ctime_current(new_dir); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 779b5c2c75f6..f7eaf7b14594 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -149,7 +149,7 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap, struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 6d30adb6b890..9301e7ecd092 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -2,6 +2,7 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. Some System V diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 379d75796a5c..fd57f03b6c93 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); ufs_handle_dirsync(dir); } @@ -397,7 +397,7 @@ got_it: ufs_set_de_type(sb, de, inode->i_mode); ufs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); @@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; ufs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 06bd84d555bd..a1e7bd9d1f98 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -292,7 +292,7 @@ cg_found: inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ufsi->i_flags = UFS_I(dir)->i_flags; ufsi->i_lastfrag = 0; ufsi->i_shadow = 0; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a4246c83a8cd..21a4779a2de5 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -296,7 +296,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, if (new) *new = 1; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); @@ -378,7 +378,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); out: brelse (bh); @@ -580,11 +580,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); - inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode_set_ctime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec), + 0); inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); @@ -626,10 +627,10 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); - inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime); + inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime), + fs32_to_cpu(sb, ufs2_inode->ui_ctimensec)); inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); - inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec); inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); @@ -726,7 +727,8 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); ufs_inode->ui_atime.tv_usec = 0; - ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, + inode_get_ctime(inode).tv_sec); ufs_inode->ui_ctime.tv_usec = 0; ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); ufs_inode->ui_mtime.tv_usec = 0; @@ -770,8 +772,9 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); - ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec); - ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec); + ufs_inode->ui_ctimensec = cpu_to_fs32(sb, + inode_get_ctime(inode).tv_nsec); ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); @@ -1205,7 +1208,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); ufs_truncate_blocks(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); out: UFSD("EXIT: err %d\n", err); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 36154b5aca6d..9cad29463791 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -220,7 +220,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); err = 0; out: @@ -282,7 +282,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; ufs_set_link(new_dir, new_de, new_page, old_inode, 1); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -298,7 +298,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); ufs_delete_entry(old_dir, old_de, old_page); mark_inode_dirty(old_inode); diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 4931bec1a01c..89247193d96d 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -11,12 +11,6 @@ #include <linux/fs.h> #include "swab.h" - -/* - * some useful macros - */ -#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) - /* * functions used for retyping */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7cecd49e078b..56eaae9dac1a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -277,17 +277,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { + struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(ctx->mm); + assert_fault_locked(vmf); - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) goto out; @@ -308,10 +307,8 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { return false; /* should never get here */ } @@ -325,11 +322,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, * threads. */ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, - unsigned long address, - unsigned long flags, + struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -338,7 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte_t ptent; bool ret = true; - mmap_assert_locked(mm); + assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -427,20 +424,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * * We also don't do userfault handling during * coredumping. hugetlbfs has the special - * follow_hugetlb_page() to skip missing pages in the + * hugetlb_follow_page_mask() to skip missing pages in the * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during - * coredumping without mmap_lock and it ends up here. + * coredumping and it ends up here. */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; - /* - * Coredumping runs without mmap_lock so we can only check that - * the mmap_lock is held, if PF_DUMPCORE was not set. - */ - mmap_assert_locked(mm); + assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) @@ -556,15 +549,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + must_wait = userfaultfd_must_wait(ctx, vmf, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vma, - vmf->address, - vmf->flags, reason); + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); - mmap_read_unlock(mm); + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); @@ -667,6 +657,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); @@ -702,6 +693,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; @@ -783,6 +775,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } @@ -940,6 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1289,13 +1283,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, __wake_userfault(ctx, range); } -static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) +static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) - return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) @@ -1306,9 +1298,20 @@ static __always_inline int validate_range(struct mm_struct *mm, return -EINVAL; if (len > task_size - start) return -EINVAL; + if (start + len <= start) + return -EINVAL; return 0; } +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + if (start & ~PAGE_MASK) + return -EINVAL; + + return validate_unaligned_range(mm, start, len); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1502,6 +1505,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; @@ -1685,6 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -1757,17 +1762,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; + ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, + uffdio_copy.len); + if (ret) + goto out; ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; - /* - * double check for wraparound just in case. copy_from_user() - * will later check uffdio_copy.src + uffdio_copy.len to fit - * in the userland range. - */ + ret = -EINVAL; - if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) - goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) @@ -1927,11 +1930,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; ret = -EINVAL; - /* double check for wraparound just in case. */ - if (uffdio_continue.range.start + uffdio_continue.range.len <= - uffdio_continue.range.start) { - goto out; - } if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | UFFDIO_CONTINUE_MODE_WP)) goto out; @@ -1965,6 +1963,61 @@ out: return ret; } +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len, + &ctx->mmap_changing, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2066,6 +2119,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_CONTINUE: ret = userfaultfd_continue(ctx, arg); break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; } return ret; } diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index dd0ae1188e87..83f20dd15522 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -128,8 +128,8 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, inode->i_atime = ns_to_timespec64( info->access_time.ns_relative_to_unix_epoch); - inode->i_ctime = ns_to_timespec64( - info->change_time.ns_relative_to_unix_epoch); + inode_set_ctime_to_ts(inode, + ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch)); inode->i_mtime = ns_to_timespec64( info->modification_time.ns_relative_to_unix_epoch); return 0; @@ -252,7 +252,7 @@ int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, if (err) return err; - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), kstat); return 0; } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 49bf3a1eb2a0..d071a6e32581 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -118,16 +118,16 @@ void fsverity_free_info(struct fsverity_info *vi); int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret); -int __init fsverity_init_info_cache(void); -void __init fsverity_exit_info_cache(void); +void __init fsverity_init_info_cache(void); /* signature.c */ #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES +extern int fsverity_require_signatures; int fsverity_verify_signature(const struct fsverity_info *vi, const u8 *signature, size_t sig_size); -int __init fsverity_init_signature(void); +void __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, @@ -136,15 +136,13 @@ fsverity_verify_signature(const struct fsverity_info *vi, return 0; } -static inline int fsverity_init_signature(void) +static inline void fsverity_init_signature(void) { - return 0; } #endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ /* verify.c */ -int __init fsverity_init_workqueue(void); -void __init fsverity_exit_workqueue(void); +void __init fsverity_init_workqueue(void); #endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index c598d2035476..6b08b1d9a7d7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -226,6 +226,14 @@ void __init fsverity_check_hash_algs(void) if (!alg->name) continue; + /* + * 0 must never be allocated as an FS_VERITY_HASH_ALG_* value, + * as it is reserved for users that use 0 to mean unspecified or + * a default value. fs/verity/ itself doesn't care and doesn't + * have a default algorithm, but some users make use of this. + */ + BUG_ON(i == 0); + BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE); /* diff --git a/fs/verity/init.c b/fs/verity/init.c index 023905151035..a29f062f6047 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -9,6 +9,37 @@ #include <linux/ratelimit.h> +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fsverity_sysctl_header; + +static struct ctl_table fsverity_sysctl_table[] = { +#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES + { + .procname = "require_signatures", + .data = &fsverity_require_signatures, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; + +static void __init fsverity_init_sysctl(void) +{ + fsverity_sysctl_header = register_sysctl("fs/verity", + fsverity_sysctl_table); + if (!fsverity_sysctl_header) + panic("fsverity sysctl registration failed"); +} +#else /* CONFIG_SYSCTL */ +static inline void fsverity_init_sysctl(void) +{ +} +#endif /* !CONFIG_SYSCTL */ + void fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...) { @@ -33,28 +64,11 @@ void fsverity_msg(const struct inode *inode, const char *level, static int __init fsverity_init(void) { - int err; - fsverity_check_hash_algs(); - - err = fsverity_init_info_cache(); - if (err) - return err; - - err = fsverity_init_workqueue(); - if (err) - goto err_exit_info_cache; - - err = fsverity_init_signature(); - if (err) - goto err_exit_workqueue; - + fsverity_init_info_cache(); + fsverity_init_workqueue(); + fsverity_init_sysctl(); + fsverity_init_signature(); return 0; - -err_exit_workqueue: - fsverity_exit_workqueue(); -err_exit_info_cache: - fsverity_exit_info_cache(); - return err; } late_initcall(fsverity_init) diff --git a/fs/verity/open.c b/fs/verity/open.c index 1db5106a9c38..6c31a871b84b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -408,18 +408,10 @@ void __fsverity_cleanup_inode(struct inode *inode) } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); -int __init fsverity_init_info_cache(void) +void __init fsverity_init_info_cache(void) { - fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, - SLAB_RECLAIM_ACCOUNT, - file_digest); - if (!fsverity_info_cachep) - return -ENOMEM; - return 0; -} - -void __init fsverity_exit_info_cache(void) -{ - kmem_cache_destroy(fsverity_info_cachep); - fsverity_info_cachep = NULL; + fsverity_info_cachep = KMEM_CACHE_USERCOPY( + fsverity_info, + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC, + file_digest); } diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 72034bc71c9d..90c07573dd77 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -24,7 +24,7 @@ * /proc/sys/fs/verity/require_signatures * If 1, all verity files must have a valid builtin signature. */ -static int fsverity_require_signatures; +int fsverity_require_signatures; /* * Keyring that contains the trusted X.509 certificates. @@ -62,6 +62,22 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } + if (fsverity_keyring->keys.nr_leaves_on_tree == 0) { + /* + * The ".fs-verity" keyring is empty, due to builtin signatures + * being supported by the kernel but not actually being used. + * In this case, verify_pkcs7_signature() would always return an + * error, usually ENOKEY. It could also be EBADMSG if the + * PKCS#7 is malformed, but that isn't very important to + * distinguish. So, just skip to ENOKEY to avoid the attack + * surface of the PKCS#7 parser, which would otherwise be + * reachable by any task able to execute FS_IOC_ENABLE_VERITY. + */ + fsverity_err(inode, + "fs-verity keyring is empty, rejecting signed file!"); + return -ENOKEY; + } + d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; @@ -93,59 +109,14 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *fsverity_sysctl_header; - -static struct ctl_table fsverity_sysctl_table[] = { - { - .procname = "require_signatures", - .data = &fsverity_require_signatures, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { } -}; - -static int __init fsverity_sysctl_init(void) -{ - fsverity_sysctl_header = register_sysctl("fs/verity", fsverity_sysctl_table); - if (!fsverity_sysctl_header) { - pr_err("sysctl registration failed!\n"); - return -ENOMEM; - } - return 0; -} -#else /* !CONFIG_SYSCTL */ -static inline int __init fsverity_sysctl_init(void) +void __init fsverity_init_signature(void) { - return 0; -} -#endif /* !CONFIG_SYSCTL */ - -int __init fsverity_init_signature(void) -{ - struct key *ring; - int err; - - ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), KEY_POS_SEARCH | + fsverity_keyring = + keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH | KEY_USR_SETATTR, - KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); - if (IS_ERR(ring)) - return PTR_ERR(ring); - - err = fsverity_sysctl_init(); - if (err) - goto err_put_ring; - - fsverity_keyring = ring; - return 0; - -err_put_ring: - key_put(ring); - return err; + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(fsverity_keyring)) + panic("failed to allocate \".fs-verity\" keyring"); } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 433cef51f5f6..904ccd7e8e16 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -346,7 +346,7 @@ void fsverity_enqueue_verify_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work); -int __init fsverity_init_workqueue(void) +void __init fsverity_init_workqueue(void) { /* * Use a high-priority workqueue to prioritize verification work, which @@ -360,12 +360,5 @@ int __init fsverity_init_workqueue(void) WQ_HIGHPRI, num_online_cpus()); if (!fsverity_read_workqueue) - return -ENOMEM; - return 0; -} - -void __init fsverity_exit_workqueue(void) -{ - destroy_workqueue(fsverity_read_workqueue); - fsverity_read_workqueue = NULL; + panic("failed to allocate fsverity_read_queue"); } diff --git a/fs/xattr.c b/fs/xattr.c index e7bbb7f57557..efd4736bc94b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1040,12 +1040,32 @@ const char *xattr_full_name(const struct xattr_handler *handler, EXPORT_SYMBOL(xattr_full_name); /** - * free_simple_xattr - free an xattr object + * simple_xattr_space - estimate the memory used by a simple xattr + * @name: the full name of the xattr + * @size: the size of its value + * + * This takes no account of how much larger the two slab objects actually are: + * that would depend on the slab implementation, when what is required is a + * deterministic number, which grows with name length and size and quantity. + * + * Return: The approximate number of bytes of memory used by such an xattr. + */ +size_t simple_xattr_space(const char *name, size_t size) +{ + /* + * Use "40" instead of sizeof(struct simple_xattr), to return the + * same result on 32-bit and 64-bit, and even if simple_xattr grows. + */ + return 40 + size + strlen(name); +} + +/** + * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ -static inline void free_simple_xattr(struct simple_xattr *xattr) +void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); @@ -1073,7 +1093,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) if (len < sizeof(*new_xattr)) return NULL; - new_xattr = kvmalloc(len, GFP_KERNEL); + new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; @@ -1164,7 +1184,6 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr - * @removed_size: the size of the removed xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE @@ -1181,29 +1200,27 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * - * Return: On success zero and on error a negative error code is returned. + * Return: On success, the removed or replaced xattr is returned, to be freed + * by the caller; or NULL if none. On failure a negative error code is returned. */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags, - ssize_t *removed_size) +struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, + const char *name, const void *value, + size_t size, int flags) { - struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; - if (removed_size) - *removed_size = -1; - /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - new_xattr->name = kstrdup(name, GFP_KERNEL); + new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { - free_simple_xattr(new_xattr); - return -ENOMEM; + simple_xattr_free(new_xattr); + return ERR_PTR(-ENOMEM); } } @@ -1217,12 +1234,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, else if (ret > 0) rbp = &(*rbp)->rb_right; else - xattr = rb_entry(*rbp, struct simple_xattr, rb_node); - if (xattr) + old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (old_xattr) break; } - if (xattr) { + if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; @@ -1230,12 +1247,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, } if (new_xattr) - rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, - &xattrs->rb_root); + rb_replace_node(&old_xattr->rb_node, + &new_xattr->rb_node, &xattrs->rb_root); else - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (!err && removed_size) - *removed_size = xattr->size; + rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { @@ -1260,12 +1275,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, out_unlock: write_unlock(&xattrs->lock); - if (err) - free_simple_xattr(new_xattr); - else - free_simple_xattr(xattr); - return err; - + if (!err) + return old_xattr; + simple_xattr_free(new_xattr); + return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) @@ -1370,14 +1383,17 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy + * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ -void simple_xattrs_free(struct simple_xattrs *xattrs) +void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; + if (freed_space) + *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; @@ -1386,7 +1402,10 @@ void simple_xattrs_free(struct simple_xattrs *xattrs) rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); - free_simple_xattr(xattr); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, + xattr->size); + simple_xattr_free(xattr); rbp = rbp_next; } } diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 52e1823241fb..ed0bc8cbc703 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -128,6 +128,7 @@ config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + depends on TMPFS && SHMEM select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a @@ -142,6 +143,23 @@ config XFS_ONLINE_SCRUB If unsure, say N. +config XFS_ONLINE_SCRUB_STATS + bool "XFS online metadata check usage data collection" + default y + depends on XFS_ONLINE_SCRUB + select XFS_DEBUG + help + If you say Y here, the kernel will gather usage data about + the online metadata check subsystem. This includes the number + of invocations, the outcomes, and the results of repairs, if any. + This may slow down scrub slightly due to the use of high precision + timers and the need to merge per-invocation information into the + filesystem counters. + + Usage data are collected in /sys/kernel/debug/xfs/scrub. + + If unsure, say N. + config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" default n diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 16e4eb431230..7762c01a85cf 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -164,15 +164,24 @@ xfs-y += $(addprefix scrub/, \ rmap.o \ scrub.o \ symlink.o \ + xfarray.o \ + xfile.o \ + ) + +xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap.o \ + rtsummary.o \ ) -xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + reap.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2cbf9ea39b8c..6360073865db 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -743,7 +743,11 @@ struct xfs_scrub_metadata { */ #define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) -#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) +/* i: Rebuild the data structure. */ +#define XFS_SCRUB_IFLAG_FORCE_REBUILD (1u << 8) + +#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR | \ + XFS_SCRUB_IFLAG_FORCE_REBUILD) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ XFS_SCRUB_OFLAG_PREEN | \ XFS_SCRUB_OFLAG_XFAIL | \ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 758aacd8166b..a35781577cad 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -222,7 +222,8 @@ xfs_inode_from_disk( */ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); - inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); + inode_set_ctime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_ctime)); ip->i_disk_size = be64_to_cpu(from->di_size); ip->i_nblocks = be64_to_cpu(from->di_nblocks); @@ -316,7 +317,7 @@ xfs_inode_to_disk( to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 2420865f3007..a5100a11faf9 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log); #define xlog_check_buf_cancel_table(log) do { } while (0) #endif +/* + * Transform a regular reservation into one suitable for recovery of a log + * intent item. + * + * Intent recovery only runs a single step of the transaction chain and defers + * the rest to a separate transaction. Therefore, we reduce logcount to 1 here + * to avoid livelocks if the log grant space is nearly exhausted due to the + * recovered intent pinning the tail. Keep the same logflags to avoid tripping + * asserts elsewhere. Struct copies abound below. + */ +static inline struct xfs_trans_res +xlog_recover_resv(const struct xfs_trans_res *r) +{ + struct xfs_trans_res ret = { + .tr_logres = r->tr_logres, + .tr_logcount = 1, + .tr_logflags = r->tr_logflags, + }; + + return ret; +} + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5e174685a77c..6264daaab37b 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -266,7 +266,8 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + if (!xfs_is_readonly(mp) && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index cb4796b6e693..6b2296ff248a 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -67,7 +67,7 @@ xfs_trans_ichgtime( if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; if (flags & XFS_ICHGTIME_CHG) - inode->i_ctime = tv; + inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bbaa65422c4f..876a2f41b063 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/reap.h" /* Superblock */ @@ -48,6 +49,10 @@ xrep_superblock( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); @@ -423,6 +428,10 @@ xrep_agf( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Start rewriting the header and implant the btrees we found. */ xrep_agf_init_header(sc, agf_bp, &old_agf); xrep_agf_set_roots(sc, agf, fab); @@ -444,13 +453,13 @@ out_revert: struct xrep_agfl { /* Bitmap of alleged AGFL blocks that we're not going to add. */ - struct xbitmap crossed; + struct xagb_bitmap crossed; /* Bitmap of other OWN_AG metadata blocks. */ - struct xbitmap agmetablocks; + struct xagb_bitmap agmetablocks; /* Bitmap of free space. */ - struct xbitmap *freesp; + struct xagb_bitmap *freesp; /* rmapbt cursor for finding crosslinked blocks */ struct xfs_btree_cur *rmap_cur; @@ -466,7 +475,6 @@ xrep_agfl_walk_rmap( void *priv) { struct xrep_agfl *ra = priv; - xfs_fsblock_t fsb; int error = 0; if (xchk_should_terminate(ra->sc, &error)) @@ -474,14 +482,13 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, - rec->rm_startblock); - error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xagb_bitmap_set(ra->freesp, rec->rm_startblock, + rec->rm_blockcount); if (error) return error; } - return xbitmap_set_btcur_path(&ra->agmetablocks, cur); + return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur); } /* Strike out the blocks that are cross-linked according to the rmapbt. */ @@ -492,12 +499,10 @@ xrep_agfl_check_extent( void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; - ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); - while (agbno <= last_agbno) { bool other_owners; @@ -507,7 +512,7 @@ xrep_agfl_check_extent( return error; if (other_owners) { - error = xbitmap_set(&ra->crossed, agbno, 1); + error = xagb_bitmap_set(&ra->crossed, agbno, 1); if (error) return error; } @@ -533,7 +538,7 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xbitmap *agfl_extents, + struct xagb_bitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; @@ -543,8 +548,8 @@ xrep_agfl_collect_blocks( ra.sc = sc; ra.freesp = agfl_extents; - xbitmap_init(&ra.agmetablocks); - xbitmap_init(&ra.crossed); + xagb_bitmap_init(&ra.agmetablocks); + xagb_bitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); @@ -556,7 +561,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); - error = xbitmap_set_btblocks(&ra.agmetablocks, cur); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; @@ -564,7 +569,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); - error = xbitmap_set_btblocks(&ra.agmetablocks, cur); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; @@ -573,17 +578,17 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ - error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks); if (error) goto out_bmp; /* Strike out the blocks that are cross-linked. */ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); - error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); + error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); xfs_btree_del_cursor(ra.rmap_cur, error); if (error) goto out_bmp; - error = xbitmap_disunion(agfl_extents, &ra.crossed); + error = xagb_bitmap_disunion(agfl_extents, &ra.crossed); if (error) goto out_bmp; @@ -591,12 +596,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents), xfs_agfl_size(mp)); out_bmp: - xbitmap_destroy(&ra.crossed); - xbitmap_destroy(&ra.agmetablocks); + xagb_bitmap_destroy(&ra.crossed); + xagb_bitmap_destroy(&ra.agmetablocks); return error; } @@ -615,18 +620,24 @@ xrep_agfl_update_agf( xfs_force_summary_recalc(sc->mp); /* Update the AGF counters. */ - if (xfs_perag_initialised_agf(sc->sa.pag)) + if (xfs_perag_initialised_agf(sc->sa.pag)) { sc->sa.pag->pagf_flcount = flcount; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, + &sc->sa.pag->pag_opstate); + } agf->agf_flfirst = cpu_to_be32(0); agf->agf_flcount = cpu_to_be32(flcount); - agf->agf_fllast = cpu_to_be32(flcount - 1); + if (flcount) + agf->agf_fllast = cpu_to_be32(flcount - 1); + else + agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1); xfs_alloc_log_agf(sc->tp, agf_bp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); } struct xrep_agfl_fill { - struct xbitmap used_extents; + struct xagb_bitmap used_extents; struct xfs_scrub *sc; __be32 *agfl_bno; xfs_agblock_t flcount; @@ -642,17 +653,15 @@ xrep_agfl_fill( { struct xrep_agfl_fill *af = priv; struct xfs_scrub *sc = af->sc; - xfs_fsblock_t fsbno = start; + xfs_agblock_t agbno = start; int error; - while (fsbno < start + len && af->fl_off < af->flcount) - af->agfl_bno[af->fl_off++] = - cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++)); + trace_xrep_agfl_insert(sc->sa.pag, agbno, len); - trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno, - XFS_FSB_TO_AGBNO(sc->mp, start), len); + while (agbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); - error = xbitmap_set(&af->used_extents, start, fsbno - 1); + error = xagb_bitmap_set(&af->used_extents, start, agbno - 1); if (error) return error; @@ -667,7 +676,7 @@ STATIC int xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xbitmap *agfl_extents, + struct xagb_bitmap *agfl_extents, xfs_agblock_t flcount) { struct xrep_agfl_fill af = { @@ -695,17 +704,17 @@ xrep_agfl_init_header( * blocks than fit in the AGFL, they will be freed in a subsequent * step. */ - xbitmap_init(&af.used_extents); + xagb_bitmap_init(&af.used_extents); af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), - xbitmap_walk(agfl_extents, xrep_agfl_fill, &af); - error = xbitmap_disunion(agfl_extents, &af.used_extents); + xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xagb_bitmap_disunion(agfl_extents, &af.used_extents); if (error) return error; /* Write new AGFL to disk. */ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); - xbitmap_destroy(&af.used_extents); + xagb_bitmap_destroy(&af.used_extents); return 0; } @@ -714,7 +723,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xbitmap agfl_extents; + struct xagb_bitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -725,7 +734,7 @@ xrep_agfl( if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xbitmap_init(&agfl_extents); + xagb_bitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -753,6 +762,10 @@ xrep_agfl( if (error) goto err; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err; + /* * Update AGF and AGFL. We reset the global free block counter when * we adjust the AGF flcount (which can fail) so avoid updating any @@ -774,10 +787,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xbitmap_destroy(&agfl_extents); + xagb_bitmap_destroy(&agfl_extents); return error; } @@ -1000,6 +1013,10 @@ xrep_agi( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Start rewriting the header and implant the btrees we found. */ xrep_agi_init_header(sc, agi_bp, &old_agi); xrep_agi_set_roots(sc, agi, fab); diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 0c959be396ea..e0c89a9a0ca0 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -301,21 +301,15 @@ xagb_bitmap_set_btblocks( * blocks going from the leaf towards the root. */ int -xbitmap_set_btcur_path( - struct xbitmap *bitmap, +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur) { - struct xfs_buf *bp; - xfs_fsblock_t fsb; int i; int error; for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - xfs_btree_get_block(cur, i, &bp); - if (!bp) - continue; - fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - error = xbitmap_set(bitmap, fsb, 1); + error = xagb_bitmap_visit_btblock(cur, i, bitmap); if (error) return error; } @@ -323,35 +317,6 @@ xbitmap_set_btcur_path( return 0; } -/* Collect a btree's block in the bitmap. */ -STATIC int -xbitmap_collect_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) -{ - struct xbitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - return xbitmap_set(bitmap, fsbno, 1); -} - -/* Walk the btree and mark the bitmap wherever a btree block is found. */ -int -xbitmap_set_btblocks( - struct xbitmap *bitmap, - struct xfs_btree_cur *cur) -{ - return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, - XFS_BTREE_VISIT_ALL, bitmap); -} - /* How many bits are set in this bitmap? */ uint64_t xbitmap_hweight( @@ -385,43 +350,6 @@ xbitmap_walk( return error; } -struct xbitmap_walk_bits { - xbitmap_walk_bits_fn fn; - void *priv; -}; - -/* Walk all the bits in a run. */ -static int -xbitmap_walk_bits_in_run( - uint64_t start, - uint64_t len, - void *priv) -{ - struct xbitmap_walk_bits *wb = priv; - uint64_t i; - int error = 0; - - for (i = start; i < start + len; i++) { - error = wb->fn(i, wb->priv); - if (error) - break; - } - - return error; -} - -/* Call a function for every set bit in this bitmap. */ -int -xbitmap_walk_bits( - struct xbitmap *bitmap, - xbitmap_walk_bits_fn fn, - void *priv) -{ - struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv}; - - return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb); -} - /* Does this bitmap have no bits set at all? */ bool xbitmap_empty( diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 84981724ecaf..4fe58bad6734 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -16,10 +16,6 @@ void xbitmap_destroy(struct xbitmap *bitmap); int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -int xbitmap_set_btcur_path(struct xbitmap *bitmap, - struct xfs_btree_cur *cur); -int xbitmap_set_btblocks(struct xbitmap *bitmap, - struct xfs_btree_cur *cur); uint64_t xbitmap_hweight(struct xbitmap *bitmap); /* @@ -33,10 +29,6 @@ typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, void *priv); -typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv); -int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn, - void *priv); - bool xbitmap_empty(struct xbitmap *bitmap); bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); @@ -110,5 +102,7 @@ static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 5bf4326e9783..75588915572e 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -38,8 +38,7 @@ xchk_setup_inode_bmap( if (error) goto out; - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, XFS_IOLOCK_EXCL); + xchk_ilock(sc, XFS_IOLOCK_EXCL); /* * We don't want any ephemeral data/cow fork updates sitting around @@ -50,8 +49,7 @@ xchk_setup_inode_bmap( sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; - sc->ilock_flags |= XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL); + xchk_ilock(sc, XFS_MMAPLOCK_EXCL); inode_dio_wait(VFS_I(sc->ip)); @@ -79,9 +77,8 @@ xchk_setup_inode_bmap( error = xchk_trans_alloc(sc, 0); if (error) goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ return error; @@ -844,7 +841,7 @@ xchk_bmap( /* Non-existent forks can be ignored. */ if (!ifp) - goto out; + return -ENOENT; info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); info.whichfork = whichfork; @@ -853,10 +850,10 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink inodes/filesystems. */ - if (!xfs_is_reflink_inode(ip)) { + /* No CoW forks on non-reflink filesystems. */ + if (!xfs_has_reflink(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); - goto out; + return 0; } break; case XFS_ATTR_FORK: @@ -876,31 +873,31 @@ xchk_bmap( /* No mappings to check. */ if (whichfork == XFS_COW_FORK) xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; case XFS_DINODE_FMT_EXTENTS: break; case XFS_DINODE_FMT_BTREE: if (whichfork == XFS_COW_FORK) { xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; } error = xchk_bmap_btree(sc, whichfork, &info); if (error) - goto out; + return error; break; default: xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; } if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + return 0; /* Find the offset of the last extent in the mapping. */ error = xfs_bmap_last_offset(ip, &endoff, whichfork); if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; + return error; /* * Scrub extent records. We use a special iterator function here that @@ -913,12 +910,12 @@ xchk_bmap( while (xchk_bmap_iext_iter(&info, &irec)) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out; + return 0; if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); - goto out; + return 0; } if (isnullstartblock(irec.br_startblock)) @@ -931,10 +928,10 @@ xchk_bmap( if (xchk_bmap_want_check_rmaps(&info)) { error = xchk_bmap_check_rmaps(sc, whichfork); if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) - goto out; + return error; } -out: - return error; + + return 0; } /* Scrub an inode's data fork. */ @@ -958,8 +955,5 @@ int xchk_bmap_cow( struct xfs_scrub *sc) { - if (!xfs_is_reflink_inode(sc->ip)) - return -ENOENT; - return xchk_bmap(sc, XFS_COW_FORK); } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 7a20256be969..de24532fe083 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -832,6 +832,25 @@ xchk_install_handle_inode( } /* + * Install an already-referenced inode for scrubbing. Get our own reference to + * the inode to make disposal simpler. The inode must not be in I_FREEING or + * I_WILL_FREE state! + */ +int +xchk_install_live_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!igrab(VFS_I(ip))) { + xchk_ino_set_corrupt(sc, ip->i_ino); + return -EFSCORRUPTED; + } + + sc->ip = ip; + return 0; +} + +/* * In preparation to scrub metadata structures that hang off of an inode, * grab either the inode referenced in the scrub control structure or the * inode passed in. If the inumber does not reference an allocated inode @@ -854,10 +873,8 @@ xchk_iget_for_scrubbing( ASSERT(sc->tp == NULL); /* We want to scan the inode we already had opened. */ - if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { - sc->ip = ip_in; - return 0; - } + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) + return xchk_install_live_inode(sc, ip_in); /* Reject internal metadata files and obviously bad inode numbers. */ if (xfs_internal_inum(mp, sc->sm->sm_ino)) @@ -1005,20 +1022,48 @@ xchk_setup_inode_contents( return error; /* Lock the inode so the VFS cannot touch this file. */ - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_IOLOCK_EXCL); error = xchk_trans_alloc(sc, resblks); if (error) goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); - + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ return error; } +void +xchk_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_ilock(sc->ip, ilock_flags); + sc->ilock_flags |= ilock_flags; +} + +bool +xchk_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->ip, ilock_flags)) { + sc->ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xchk_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->ilock_flags &= ~ilock_flags; + xfs_iunlock(sc->ip, ilock_flags); +} + /* * Predicate that decides if we need to evaluate the cross-reference check. * If there was an error accessing the cross-reference btree, just delete @@ -1185,3 +1230,155 @@ xchk_fsgates_enable( sc->flags |= scrub_fsgates; } + +/* + * Decide if this is this a cached inode that's also allocated. The caller + * must hold a reference to an AG and the AGI buffer lock to prevent inodes + * from being allocated or freed. + * + * Look up an inode by number in the given file system. If the inode number + * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA. + * If the inode is being reclaimed, return -ENODATA because we know the inode + * cache cannot be updating the ondisk metadata. + * + * Otherwise, the incore inode is the one we want, and it is either live, + * somewhere in the inactivation machinery, or reclaimable. The inode is + * allocated if i_mode is nonzero. In all three cases, the cached inode will + * be more up to date than the ondisk inode buffer, so we must use the incore + * i_mode. + */ +int +xchk_inode_is_allocated( + struct xfs_scrub *sc, + xfs_agino_t agino, + bool *inuse) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + xfs_ino_t ino; + struct xfs_inode *ip; + int error; + + /* caller must hold perag reference */ + if (pag == NULL) { + ASSERT(pag != NULL); + return -EINVAL; + } + + /* caller must have AGI buffer */ + if (sc->sa.agi_bp == NULL) { + ASSERT(sc->sa.agi_bp != NULL); + return -EINVAL; + } + + /* reject inode numbers outside existing AGs */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + if (!xfs_verify_ino(mp, ino)) + return -EINVAL; + + error = -ENODATA; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* cache miss */ + goto out_rcu; + } + + /* + * If the inode number doesn't match, the incore inode got reused + * during an RCU grace period and the radix tree hasn't been updated. + * This isn't the inode we want. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) + goto out_skip; + + trace_xchk_inode_is_allocated(ip); + + /* + * We have an incore inode that matches the inode we want, and the + * caller holds the perag structure and the AGI buffer. Let's check + * our assumptions below: + */ + +#ifdef DEBUG + /* + * (1) If the incore inode is live (i.e. referenced from the dcache), + * it will not be INEW, nor will it be in the inactivation or reclaim + * machinery. The ondisk inode had better be allocated. This is the + * most trivial case. + */ + if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE | + XFS_INACTIVATING))) { + /* live inode */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * If the incore inode is INEW, there are several possibilities: + * + * (2) For a file that is being created, note that we allocate the + * ondisk inode before allocating, initializing, and adding the incore + * inode to the radix tree. + * + * (3) If the incore inode is being recycled, the inode has to be + * allocated because we don't allow freed inodes to be recycled. + * Recycling doesn't touch i_mode. + */ + if (ip->i_flags & XFS_INEW) { + /* created on disk already or recycling */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * (4) If the inode is queued for inactivation (NEED_INACTIVE) but + * inactivation has not started (!INACTIVATING), it is still allocated. + */ + if ((ip->i_flags & XFS_NEED_INACTIVE) && + !(ip->i_flags & XFS_INACTIVATING)) { + /* definitely before difree */ + ASSERT(VFS_I(ip)->i_mode != 0); + } +#endif + + /* + * If the incore inode is undergoing inactivation (INACTIVATING), there + * are two possibilities: + * + * (5) It is before the point where it would get freed ondisk, in which + * case i_mode is still nonzero. + * + * (6) It has already been freed, in which case i_mode is zero. + * + * We don't take the ILOCK here, but difree and dialloc update the AGI, + * and we've taken the AGI buffer lock, which prevents that from + * happening. + */ + + /* + * (7) Inodes undergoing inactivation (INACTIVATING) or queued for + * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still + * reflects the ondisk state. + */ + + /* + * (8) If the inode is in IFLUSHING, it's safe to query i_mode because + * the flush code uses i_mode to format the ondisk inode. + */ + + /* + * (9) If the inode is in IRECLAIM and was reachable via the radix + * tree, it still has the same i_mode as it did before it entered + * reclaim. The inode object is still alive because we hold the RCU + * read lock. + */ + + *inuse = VFS_I(ip)->i_mode != 0; + error = 0; + +out_skip: + spin_unlock(&ip->i_flags_lock); +out_rcu: + rcu_read_unlock(); + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 791235cd9b00..cabdc0e16838 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -88,10 +88,16 @@ int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT -int xchk_setup_rt(struct xfs_scrub *sc); +int xchk_setup_rtbitmap(struct xfs_scrub *sc); +int xchk_setup_rtsummary(struct xfs_scrub *sc); #else static inline int -xchk_setup_rt(struct xfs_scrub *sc) +xchk_setup_rtbitmap(struct xfs_scrub *sc) +{ + return -ENOENT; +} +static inline int +xchk_setup_rtsummary(struct xfs_scrub *sc) { return -ENOENT; } @@ -137,6 +143,12 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); int xchk_iget_for_scrubbing(struct xfs_scrub *sc); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); +int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + +void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags); +void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); @@ -155,9 +167,29 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Decide if a repair is required. */ +static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN); +} +#else +# define xchk_needs_repair(sc) (false) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + int xchk_metadata_inode_forks(struct xfs_scrub *sc); /* + * Helper macros to allocate and format xfile description strings. + * Callers must kfree the pointer returned. + */ +#define xchk_xfile_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ + (sc)->mp->m_super->s_id, ##__VA_ARGS__) + +/* * Setting up a hook to wait for intents to drain is costly -- we have to take * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it * up, and again to tear it down. These costs add up quickly, so we only want @@ -171,4 +203,7 @@ static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); +int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, + bool *inuse); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e382a35e98d8..05be757668bb 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019-2023 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" @@ -16,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -53,6 +56,7 @@ struct xchk_fscounters { uint64_t frextents; unsigned long long icount_min; unsigned long long icount_max; + bool frozen; }; /* @@ -123,6 +127,82 @@ xchk_fscount_warmup( return error; } +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsfreeze(sc, error); + return error; +} + +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. */ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + int xchk_setup_fscounters( struct xfs_scrub *sc) @@ -140,6 +220,7 @@ xchk_setup_fscounters( sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; fsc = sc->buf; fsc->sc = sc; @@ -150,7 +231,18 @@ xchk_setup_fscounters( if (error) return error; - return xchk_trans_alloc(sc, 0); + /* + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); } /* @@ -290,8 +382,7 @@ retry: if (fsc->ifree > fsc->icount) { if (tries--) goto retry; - xchk_set_incomplete(sc); - return 0; + return -EDEADLOCK; } return 0; @@ -367,6 +458,8 @@ xchk_fscount_count_frextents( * Otherwise, we /might/ have a problem. If the change in the summations is * more than we want to tolerate, the filesystem is probably busy and we should * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. */ static inline bool xchk_fscount_within_range( @@ -396,21 +489,7 @@ xchk_fscount_within_range( if (expected >= min_value && expected <= max_value) return true; - /* - * If the difference between the two summations is too large, the fs - * might just be busy and so we'll mark the scrub incomplete. Return - * true here so that we don't mark the counter corrupt. - * - * XXX: In the future when userspace can grant scrub permission to - * quiesce the filesystem to solve the outsized variance problem, this - * check should be moved up and the return code changed to signal to - * userspace that we need quiesce permission. - */ - if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { - xchk_set_incomplete(sc); - return true; - } - + /* Everything else is bad. */ return false; } @@ -422,6 +501,7 @@ xchk_fscounters( struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; int error; /* Snapshot the percpu counters. */ @@ -431,9 +511,26 @@ xchk_fscounters( frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) + if (icount < 0 || ifree < 0) xchk_set_corrupt(sc); + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_mod_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + + xchk_set_corrupt(sc); + return 0; + } + /* See if icount is obviously wrong. */ if (icount < fsc->icount_min || icount > fsc->icount_max) xchk_set_corrupt(sc); @@ -447,12 +544,6 @@ xchk_fscounters( xchk_set_corrupt(sc); /* - * XXX: We can't quiesce percpu counter updates, so exit early. - * This can be re-enabled when we gain exclusive freeze functionality. - */ - return 0; - - /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. */ @@ -463,8 +554,6 @@ xchk_fscounters( error = xchk_fscount_aggregate_agcounts(sc, fsc); if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) - return 0; /* Count the free extents counter for rt volumes. */ error = xchk_fscount_count_frextents(sc, fsc); @@ -473,20 +562,45 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; - /* Compare the in-core counters with whatever we counted. */ - if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) - xchk_set_corrupt(sc); + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) - xchk_set_corrupt(sc); + fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) - xchk_set_corrupt(sc); + fsc->frextents)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index d2b2a1cb6533..5e2b09ed6e29 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -226,6 +226,16 @@ xchk_ag_btree_healthy_enough( return true; } + /* + * If we just repaired some AG metadata, sc->sick_mask will reflect all + * the per-AG metadata types that were repaired. Exclude these from + * the filesystem health query because we have not yet updated the + * health status and we want everything to be scanned. + */ + if ((sc->flags & XREP_ALREADY_FIXED) && + type_to_health_flag[sc->sm->sm_type].group == XHG_AG) + mask &= ~sc->sick_mask; + if (xfs_ag_has_sickness(pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; return false; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 575f22a02ebe..fb7bbf47ae5d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -328,8 +328,7 @@ xchk_iallocbt_check_cluster_ifree( goto out; } - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, - &ino_inuse); + error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ freemask_ok = irec_free ^ !!(dip->di_mode); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 3e1e02e340a6..59d7912fb75f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -32,15 +32,13 @@ xchk_prepare_iscrub( { int error; - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_IOLOCK_EXCL); error = xchk_trans_alloc(sc, 0); if (error) return error; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -83,7 +81,10 @@ xchk_setup_inode( /* We want to scan the opened inode, so lock it and exit. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { - sc->ip = ip_in; + error = xchk_install_live_inode(sc, ip_in); + if (error) + return error; + return xchk_prepare_iscrub(sc); } diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 58d5dfb7ea21..e6155d86f791 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -150,8 +150,8 @@ xchk_parent_validate( lock_mode = xchk_parent_ilock_dir(dp); if (!lock_mode) { - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); error = -EAGAIN; goto out_rele; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index e6caa358cbda..5671c8153433 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -59,9 +59,12 @@ xchk_setup_quota( error = xchk_setup_fs(sc); if (error) return error; - sc->ip = xfs_quota_inode(sc->mp, dqtype); - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); - sc->ilock_flags = XFS_ILOCK_EXCL; + + error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype)); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -235,13 +238,11 @@ xchk_quota( * data fork we have to drop ILOCK_EXCL to use the regular dquot * functions. */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = 0; + xchk_iunlock(sc, sc->ilock_flags); sqi.sc = sc; sqi.last_id = 0; error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - sc->ilock_flags = XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c new file mode 100644 index 000000000000..86a62420e02c --- /dev/null +++ b/fs/xfs/scrub/reap.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/reap.h" + +/* + * Disposal of Blocks from Old Metadata + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. + * Previously, we used the rmapbt to collect the extents (bitmap) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from bitmap. In theory the extents + * remaining in bitmap are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @bitmap, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. + * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We must also invalidate any buffers associated with + * @bitmap. + */ + +/* Information about reaping extents after a repair. */ +struct xreap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + + /* If true, roll the transaction before reaping the next extent. */ + bool force_roll; + + /* Number of deferred reaps attached to the current transaction. */ + unsigned int deferred; + + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int invalidated; + + /* Number of deferred reaps queued during the whole reap sequence. */ + unsigned long long total_deferred; +}; + +/* Put a block back on the AGFL. */ +STATIC int +xreap_put_freelist( + struct xfs_scrub *sc, + xfs_agblock_t agbno) +{ + struct xfs_buf *agfl_bp; + int error; + + /* Make sure there's space on the freelist. */ + error = xrep_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, + &XFS_RMAP_OINFO_AG); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + agfl_bp, agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Are there any uncommitted reap operations? */ +static inline bool xreap_dirty(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred) + return true; + if (rs->invalidated) + return true; + if (rs->total_deferred) + return true; + return false; +} + +#define XREAP_MAX_BINVAL (2048) + +/* + * Decide if we want to roll the transaction after reaping an extent. We don't + * want to overrun the transaction reservation, so we prohibit more than + * 128 EFIs per transaction. For the same reason, we limit the number + * of buffer invalidations to 2048. + */ +static inline bool xreap_want_roll(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) + return true; + if (rs->invalidated > XREAP_MAX_BINVAL) + return true; + return false; +} + +static inline void xreap_reset(struct xreap_state *rs) +{ + rs->total_deferred += rs->deferred; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +#define XREAP_MAX_DEFER_CHAIN (2048) + +/* + * Decide if we want to finish the deferred ops that are attached to the scrub + * transaction. We don't want to queue huge chains of deferred ops because + * that can consume a lot of log space and kernel memory. Hence we trigger a + * xfs_defer_finish if there are more than 2048 deferred reap operations or the + * caller did some real work. + */ +static inline bool +xreap_want_defer_finish(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) + return true; + return false; +} + +static inline void xreap_defer_finish_reset(struct xreap_state *rs) +{ + rs->total_deferred = 0; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +/* Try to invalidate the incore buffers for an extent that we're freeing. */ +STATIC void +xreap_agextent_binval( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t agbno_next = agbno + *aglenp; + xfs_agblock_t bno = agbno; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return; + + /* + * If there are incore buffers for these blocks, invalidate them. We + * assume that the lack of any other known owners means that the buffer + * can be locked without risk of deadlocking. The buffer cache cannot + * detect aliasing, so employ nested loops to scan for incore buffers + * of any plausible size. + */ + while (bno < agbno_next) { + xfs_agblock_t fsbcount; + xfs_agblock_t max_fsbs; + + /* + * Max buffer size is the max remote xattr buffer size, which + * is one fs block larger than 64k. + */ + max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, + xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); + + for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + struct xfs_buf *bp = NULL; + xfs_daddr_t daddr; + int error; + + daddr = XFS_AGB_TO_DADDR(mp, agno, bno); + error = xfs_buf_incore(mp->m_ddev_targp, daddr, + XFS_FSB_TO_BB(mp, fsbcount), + XBF_LIVESCAN, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + rs->invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * far we've gotten. + */ + if (rs->invalidated > XREAP_MAX_BINVAL) { + *aglenp -= agbno_next - bno; + goto out; + } + } + + bno++; + } + +out: + trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); +} + +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. AGFL blocks can only be put back one at + * a time. + */ +STATIC int +xreap_agextent_select( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_agblock_t agbno_next, + bool *crosslinked, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_agblock_t bno = agbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* AGFL blocks can only be deal with one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) + goto out_found; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < agbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + +out_found: + *aglenp = len; + trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this AG extent as possible. The + * number of blocks disposed of will be returned in @aglenp. + */ +STATIC int +xreap_agextent_iter( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_fsblock_t fsbno; + int error = 0; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + + rs->force_roll = true; + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, + *aglenp, rs->oinfo); + } + + trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + + /* + * Invalidate as many buffers as we can, starting at agbno. If this + * function sets *aglenp to zero, the transaction is full of logged + * buffer invalidations, so we need to return early so that we can + * roll and retry. + */ + xreap_agextent_binval(rs, agbno, aglenp); + if (*aglenp == 0) { + ASSERT(xreap_want_roll(rs)); + return 0; + } + + /* Put blocks back on the AGFL one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) { + ASSERT(*aglenp == 1); + error = xreap_put_freelist(sc, agbno); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + + /* + * Use deferred frees to get rid of the old btree blocks to try to + * minimize the window in which we could crash and lose the old blocks. + */ + error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + rs->resv, true); + if (error) + return error; + + rs->deferred++; + return 0; +} + +/* + * Break an AG metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. + */ +STATIC int +xreap_agmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agblock_t agbno = fsbno; + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip == NULL); + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + return error; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + return error; + + if (xreap_want_defer_finish(rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xrep_roll_ag_trans(sc); + if (error) + return error; + xreap_reset(rs); + } + + agbno += aglen; + } + + return 0; +} + +/* Dispose of every block of every AG metadata extent in the bitmap. */ +int +xrep_reap_agblocks( + struct xfs_scrub *sc, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip == NULL); + + error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h new file mode 100644 index 000000000000..fe24626af164 --- /dev/null +++ b/fs/xfs/scrub/reap.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_REAP_H__ +#define __XFS_SCRUB_REAP_H__ + +int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +#endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index ac6d8803e660..1b8b5439f2d7 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -26,11 +26,13 @@ #include "xfs_ag_resv.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/stats.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -39,8 +41,10 @@ */ int xrep_attempt( - struct xfs_scrub *sc) + struct xfs_scrub *sc, + struct xchk_stats_run *run) { + u64 repair_start; int error = 0; trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); @@ -49,8 +53,11 @@ xrep_attempt( /* Repair whatever's broken. */ ASSERT(sc->ops->repair); + run->repair_attempted = true; + repair_start = xchk_stats_now(); error = sc->ops->repair(sc); trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error); + run->repair_ns += xchk_stats_elapsed_ns(repair_start); switch (error) { case 0: /* @@ -59,14 +66,17 @@ xrep_attempt( */ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; + run->repair_succeeded = true; return -EAGAIN; case -ECHRNG: sc->flags |= XCHK_NEED_DRAIN; + run->retries++; return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; + run->retries++; return -EAGAIN; } /* @@ -166,6 +176,56 @@ xrep_roll_ag_trans( return 0; } +/* Finish all deferred work attached to the repair transaction. */ +int +xrep_defer_finish( + struct xfs_scrub *sc) +{ + int error; + + /* + * Keep the AG header buffers locked while we complete deferred work + * items. Ensure that both AG buffers are dirty and held when we roll + * the transaction so that they move forward in the log without losing + * the bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } + + /* + * Finish all deferred work items. We still hold the AG header buffers + * locked regardless of whether or not that succeeds. On failure, the + * buffers will be released during teardown on our way out of the + * kernel. If successful, join the buffers to the new transaction + * and move on. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * Release the hold that we set above because defer_finish won't do + * that for us. The defer roll code redirties held buffers after each + * roll, so the AG header buffers should be ready for logging. + */ + if (sc->sa.agi_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); + if (sc->sa.agf_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); + + return 0; +} + /* * Does the given AG have enough space to rebuild a btree? Neither AG * reservation can be critical, and we must have enough space (factoring @@ -297,89 +357,6 @@ xrep_calc_ag_resblks( return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } -/* Allocate a block in an AG. */ -int -xrep_alloc_ag_block( - struct xfs_scrub *sc, - const struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv) -{ - struct xfs_alloc_arg args = {0}; - xfs_agblock_t bno; - int error; - - switch (resv) { - case XFS_AG_RESV_AGFL: - case XFS_AG_RESV_RMAPBT: - error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp, - sc->sa.agf_bp, &bno, 1); - if (error) - return error; - if (bno == NULLAGBLOCK) - return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); - *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); - if (resv == XFS_AG_RESV_RMAPBT) - xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); - return 0; - default: - break; - } - - args.tp = sc->tp; - args.mp = sc->mp; - args.pag = sc->sa.pag; - args.oinfo = *oinfo; - args.minlen = 1; - args.maxlen = 1; - args.prod = 1; - args.resv = resv; - - error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno); - if (error) - return error; - if (args.fsbno == NULLFSBLOCK) - return -ENOSPC; - ASSERT(args.len == 1); - *fsbno = args.fsbno; - - return 0; -} - -/* Initialize a new AG btree root block with zero entries. */ -int -xrep_init_btblock( - struct xfs_scrub *sc, - xfs_fsblock_t fsb, - struct xfs_buf **bpp, - xfs_btnum_t btnum, - const struct xfs_buf_ops *ops) -{ - struct xfs_trans *tp = sc->tp; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp; - int error; - - trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), btnum); - - ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno); - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, - &bp); - if (error) - return error; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno); - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); - xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); - bp->b_ops = ops; - *bpp = bp; - - return 0; -} - /* * Reconstructing per-AG Btrees * @@ -404,91 +381,8 @@ xrep_init_btblock( * sublist. As with the other btrees we subtract sublist from bitmap, and the * result (since the rmapbt lives in the free space) are the blocks from the * old rmapbt. - * - * Disposal of Blocks from Old per-AG Btrees - * - * Now that we've constructed a new btree to replace the damaged one, we want - * to dispose of the blocks that (we think) the old btree was using. - * Previously, we used the rmapbt to collect the extents (bitmap) with the - * rmap owner corresponding to the tree we rebuilt, collected extents for any - * blocks with the same rmap owner that are owned by another data structure - * (sublist), and subtracted sublist from bitmap. In theory the extents - * remaining in bitmap are the old btree's blocks. - * - * Unfortunately, it's possible that the btree was crosslinked with other - * blocks on disk. The rmap data can tell us if there are multiple owners, so - * if the rmapbt says there is an owner of this block other than @oinfo, then - * the block is crosslinked. Remove the reverse mapping and continue. - * - * If there is one rmap record, we can free the block, which removes the - * reverse mapping but doesn't add the block to the free space. Our repair - * strategy is to hope the other metadata objects crosslinked on this block - * will be rebuilt (atop different blocks), thereby removing all the cross - * links. - * - * If there are no rmap records at all, we also free the block. If the btree - * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't - * supposed to be a rmap record and everything is ok. For other btrees there - * had to have been an rmap entry for the block to have ended up on @bitmap, - * so if it's gone now there's something wrong and the fs will shut down. - * - * Note: If there are multiple rmap records with only the same rmap owner as - * the btree we're trying to rebuild and the block is indeed owned by another - * data structure with the same rmap owner, then the block will be in sublist - * and therefore doesn't need disposal. If there are multiple rmap records - * with only the same rmap owner but the block is not owned by something with - * the same rmap owner, the block will be freed. - * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We also assume that the caller already invalidated any - * buffers associated with @bitmap. */ -static int -xrep_invalidate_block( - uint64_t fsbno, - void *priv) -{ - struct xfs_scrub *sc = priv; - struct xfs_buf *bp; - int error; - - /* Skip AG headers and post-EOFS blocks */ - if (!xfs_verify_fsbno(sc->mp, fsbno)) - return 0; - - error = xfs_buf_incore(sc->mp->m_ddev_targp, - XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); - if (error) - return 0; - - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - return 0; -} - -/* - * Invalidate buffers for per-AG btree blocks we're dumping. This function - * is not intended for use with file data repairs; we have bunmapi for that. - */ -int -xrep_invalidate_blocks( - struct xfs_scrub *sc, - struct xbitmap *bitmap) -{ - /* - * For each block in each extent, see if there's an incore buffer for - * exactly that block; if so, invalidate it. The buffer cache only - * lets us look for one buffer at a time, so we have to look one block - * at a time. Avoid invalidating AG headers and post-EOFS blocks - * because we never own those; and if we can't TRYLOCK the buffer we - * assume it's owned by someone else. - */ - return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc); -} - /* Ensure the freelist is the correct size. */ int xrep_fix_freelist( @@ -507,155 +401,6 @@ xrep_fix_freelist( can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); } -/* Information about reaping extents after a repair. */ -struct xrep_reap_state { - struct xfs_scrub *sc; - - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; -}; - -/* - * Put a block back on the AGFL. - */ -STATIC int -xrep_put_freelist( - struct xfs_scrub *sc, - xfs_agblock_t agbno) -{ - struct xfs_buf *agfl_bp; - int error; - - /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); - if (error) - return error; - - /* - * Since we're "freeing" a lost block onto the AGFL, we have to - * create an rmap for the block prior to merging it or else other - * parts will break. - */ - error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, - &XFS_RMAP_OINFO_AG); - if (error) - return error; - - /* Put the block on the AGFL. */ - error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); - if (error) - return error; - - error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - agfl_bp, agbno, 0); - if (error) - return error; - xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, - XFS_EXTENT_BUSY_SKIP_DISCARD); - - return 0; -} - -/* Dispose of a single block. */ -STATIC int -xrep_reap_block( - uint64_t fsbno, - void *priv) -{ - struct xrep_reap_state *rs = priv; - struct xfs_scrub *sc = rs->sc; - struct xfs_btree_cur *cur; - struct xfs_buf *agf_bp = NULL; - xfs_agblock_t agbno; - bool has_other_rmap; - int error; - - ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - trace_xrep_dispose_btree_extent(sc->mp, - XFS_FSB_TO_AGNO(sc->mp, fsbno), - XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); - - agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - - /* - * If we are repairing per-inode metadata, we need to read in the AGF - * buffer. Otherwise, we're repairing a per-AG structure, so reuse - * the AGF buffer that the setup functions already grabbed. - */ - if (sc->ip) { - error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); - if (error) - return error; - } else { - agf_bp = sc->sa.agf_bp; - } - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); - - /* Can we find any other rmappings? */ - error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, - &has_other_rmap); - xfs_btree_del_cursor(cur, error); - if (error) - goto out_free; - - /* - * If there are other rmappings, this block is cross linked and must - * not be freed. Remove the reverse mapping and move on. Otherwise, - * we were the only owner of the block, so free the extent, which will - * also remove the rmap. - * - * XXX: XFS doesn't support detecting the case where a single block - * metadata structure is crosslinked with a multi-block structure - * because the buffer cache doesn't detect aliasing problems, so we - * can't fix 100% of crosslinking problems (yet). The verifiers will - * blow on writeout, the filesystem will shut down, and the admin gets - * to run xfs_repair. - */ - if (has_other_rmap) - error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, - 1, rs->oinfo); - else if (rs->resv == XFS_AG_RESV_AGFL) - error = xrep_put_freelist(sc, agbno); - else - error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo, - rs->resv); - if (agf_bp != sc->sa.agf_bp) - xfs_trans_brelse(sc->tp, agf_bp); - if (error) - return error; - - if (sc->ip) - return xfs_trans_roll_inode(&sc->tp, sc->ip); - return xrep_roll_ag_trans(sc); - -out_free: - if (agf_bp != sc->sa.agf_bp) - xfs_trans_brelse(sc->tp, agf_bp); - return error; -} - -/* Dispose of every block of every extent in the bitmap. */ -int -xrep_reap_extents( - struct xfs_scrub *sc, - struct xbitmap *bitmap, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - struct xrep_reap_state rs = { - .sc = sc, - .oinfo = oinfo, - .resv = type, - }; - - ASSERT(xfs_has_rmapbt(sc->mp)); - - return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs); -} - /* * Finding per-AG Btree Roots for AGF/AGI Reconstruction * diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index dce791c679ee..60d2a9ae5f2e 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -8,6 +8,8 @@ #include "xfs_quota_defs.h" +struct xchk_stats_run; + static inline int xrep_notsupported(struct xfs_scrub *sc) { return -EOPNOTSUPP; @@ -15,28 +17,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + + /* Repair helpers */ -int xrep_attempt(struct xfs_scrub *sc); +int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); -int xrep_alloc_ag_block(struct xfs_scrub *sc, - const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv); -int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, - struct xfs_buf **bpp, xfs_btnum_t btnum, - const struct xfs_buf_ops *ops); struct xbitmap; struct xagb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, - const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -70,7 +72,8 @@ int xrep_agi(struct xfs_scrub *sc); static inline int xrep_attempt( - struct xfs_scrub *sc) + struct xfs_scrub *sc, + struct xchk_stats_run *run) { return -EOPNOTSUPP; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e7dace7b4be8..008ddb599e13 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -19,19 +19,20 @@ /* Set us up with the realtime metadata locked. */ int -xchk_setup_rt( +xchk_setup_rtbitmap( struct xfs_scrub *sc) { int error; - error = xchk_setup_fs(sc); + error = xchk_trans_alloc(sc, 0); if (error) return error; - sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; - sc->ip = sc->mp->m_rbmip; - xfs_ilock(sc->ip, sc->ilock_flags); + error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); return 0; } @@ -123,43 +124,6 @@ out: return error; } -/* Scrub the realtime summary. */ -int -xchk_rtsummary( - struct xfs_scrub *sc) -{ - struct xfs_inode *rsumip = sc->mp->m_rsumip; - struct xfs_inode *old_ip = sc->ip; - uint old_ilock_flags = sc->ilock_flags; - int error = 0; - - /* - * We ILOCK'd the rt bitmap ip in the setup routine, now lock the - * rt summary ip in compliance with the rt inode locking rules. - * - * Since we switch sc->ip to rsumip we have to save the old ilock - * flags so that we don't mix up the inode state that @sc tracks. - */ - sc->ip = rsumip; - sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM; - xfs_ilock(sc->ip, sc->ilock_flags); - - /* Invoke the fork scrubber. */ - error = xchk_metadata_inode_forks(sc); - if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out; - - /* XXX: implement this some day */ - xchk_set_incomplete(sc); -out: - /* Switch back to the rtbitmap inode and lock flags. */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = old_ilock_flags; - sc->ip = old_ip; - return error; -} - - /* xref check that the extent is not free in the rtbitmap */ void xchk_xref_is_used_rt_space( diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c new file mode 100644 index 000000000000..437ed9acbb27 --- /dev/null +++ b/fs/xfs/scrub/rtsummary.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" + +/* + * Realtime Summary + * ================ + * + * We check the realtime summary by scanning the realtime bitmap file to create + * a new summary file incore, and then we compare the computed version against + * the ondisk version. We use the 'xfile' functionality to store this + * (potentially large) amount of data in pageable memory. + */ + +/* Set us up to check the rtsummary file. */ +int +xchk_setup_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* + * Create an xfile to construct a new rtsummary file. The xfile allows + * us to avoid pinning kernel memory for this purpose. + */ + descr = xchk_xfile_descr(sc, "realtime summary file"); + error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + kfree(descr); + if (error) + return error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + /* Allocate a memory buffer for the summary comparison. */ + sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + error = xchk_install_live_inode(sc, mp->m_rsumip); + if (error) + return error; + + /* + * Locking order requires us to take the rtbitmap first. We must be + * careful to unlock it ourselves when we are done with the rtbitmap + * file since the scrub infrastructure won't do that for us. Only + * then we can lock the rtsummary inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + return 0; +} + +/* Helper functions to record suminfo words in an xfile. */ + +typedef unsigned int xchk_rtsumoff_t; + +static inline int +xfsum_load( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + xfs_suminfo_t *info) +{ + return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_store( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + const xfs_suminfo_t info) +{ + return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_copyout( + struct xfs_scrub *sc, + xchk_rtsumoff_t sumoff, + xfs_suminfo_t *info, + unsigned int nr_words) +{ + return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG, + sumoff << XFS_WORDLOG); +} + +/* Update the summary file to reflect the free extent that we've accumulated. */ +STATIC int +xchk_rtsum_record_free( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub *sc = priv; + xfs_fileoff_t rbmoff; + xfs_rtblock_t rtbno; + xfs_filblks_t rtlen; + xchk_rtsumoff_t offs; + unsigned int lenlog; + xfs_suminfo_t v = 0; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* Compute the relevant location in the rtsum file. */ + rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext); + lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + offs = XFS_SUMOFFS(mp, lenlog, rbmoff); + + rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize; + + if (!xfs_verify_rtext(mp, rtbno, rtlen)) { + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + return -EFSCORRUPTED; + } + + /* Bump the summary count. */ + error = xfsum_load(sc, offs, &v); + if (error) + return error; + + v++; + trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, + lenlog, offs, v); + + return xfsum_store(sc, offs, v); +} + +/* Compute the realtime summary from the realtime bitmap. */ +STATIC int +xchk_rtsum_compute( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long rtbmp_bytes; + + /* If the bitmap size doesn't match the computed size, bail. */ + rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY); + if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) != + mp->m_rbmip->i_disk_size) + return -EFSCORRUPTED; + + return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, + sc); +} + +/* Compare the rtsummary file against the one we computed. */ +STATIC int +xchk_rtsum_compare( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + struct xfs_bmbt_irec map; + xfs_fileoff_t off; + xchk_rtsumoff_t sumoff = 0; + int nmap; + + for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + return 0; + } + + /* Read a block's worth of ondisk rtsummary file. */ + error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + /* Read a block's worth of computed rtsummary file. */ + error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + if (error) { + xfs_trans_brelse(sc->tp, bp); + return error; + } + + if (memcmp(bp->b_addr, sc->buf, + mp->m_blockwsize << XFS_WORDLOG) != 0) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + + xfs_trans_brelse(sc->tp, bp); + sumoff += mp->m_blockwsize; + } + + return 0; +} + +/* Scrub the realtime summary. */ +int +xchk_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Invoke the fork scrubber. */ + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out_rbm; + + /* Construct the new summary file from the rtbitmap. */ + error = xchk_rtsum_compute(sc); + if (error == -EFSCORRUPTED) { + /* + * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref + * error since we're checking the summary file. + */ + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + error = 0; + goto out_rbm; + } + if (error) + goto out_rbm; + + /* Does the computed summary file match the actual rtsummary file? */ + error = xchk_rtsum_compare(sc); + +out_rbm: + /* Unlock the rtbitmap since we're done with it. */ + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 3d98f604765e..4849efcaa33a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -22,6 +22,8 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include "scrub/stats.h" +#include "scrub/xfile.h" /* * Online Scrub and Repair @@ -166,8 +168,6 @@ xchk_teardown( struct xfs_scrub *sc, int error) { - struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); - xchk_ag_free(sc, &sc->sa); if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) @@ -178,14 +178,18 @@ xchk_teardown( } if (sc->ip) { if (sc->ilock_flags) - xfs_iunlock(sc->ip, sc->ilock_flags); - if (sc->ip != ip_in && - !xfs_internal_inum(sc->mp, sc->ip->i_ino)) - xchk_irele(sc, sc->ip); + xchk_iunlock(sc, sc->ilock_flags); + xchk_irele(sc, sc->ip); sc->ip = NULL; } - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); + } + if (sc->xfile) { + xfile_destroy(sc->xfile); + sc->xfile = NULL; + } if (sc->buf) { if (sc->buf_cleanup) sc->buf_cleanup(sc->buf); @@ -320,14 +324,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, - .setup = xchk_setup_rt, + .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, - .setup = xchk_setup_rt, + .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .has = xfs_has_realtime, .repair = xrep_notsupported, @@ -407,6 +411,11 @@ xchk_validate_inputs( goto out; } + /* No rebuild without repair. */ + if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) && + !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return -EINVAL; + /* * We only want to repair read-write v5+ filesystems. Defer the check * for ops->repair until after our scrub confirms that we need to @@ -461,8 +470,10 @@ xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) { + struct xchk_stats_run run = { }; struct xfs_scrub *sc; struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; + u64 check_start; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -505,6 +516,8 @@ retry_op: error = mnt_want_write_file(sc->file); if (error) goto out_sc; + + sc->flags |= XCHK_HAVE_FREEZE_PROT; } /* Set up for the operation. */ @@ -517,7 +530,9 @@ retry_op: goto out_teardown; /* Scrub for errors. */ + check_start = xchk_stats_now(); error = sc->ops->scrub(sc); + run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) @@ -529,15 +544,16 @@ retry_op: if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix; + bool needs_fix = xchk_needs_repair(sc->sm); + + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + needs_fix = true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + needs_fix = true; - needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT | - XFS_SCRUB_OFLAG_PREEN)); /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. @@ -551,7 +567,7 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(sc); + error = xrep_attempt(sc, &run); if (error == -EAGAIN) { /* * Either the repair function succeeded or it couldn't @@ -572,6 +588,8 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: + if (error != -ENOENT) + xchk_stats_merge(mp, sm, &run); kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); @@ -585,6 +603,7 @@ need_drain: if (error) goto out_sc; sc->flags |= XCHK_NEED_DRAIN; + run.retries++; goto retry_op; try_harder: /* @@ -596,5 +615,6 @@ try_harder: if (error) goto out_sc; sc->flags |= XCHK_TRY_HARDER; + run.retries++; goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e113f2f5c254..1ef9c6b4842a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -88,6 +88,10 @@ struct xfs_scrub { */ void (*buf_cleanup)(void *buf); + /* xfile used by the scrubbers; freed at teardown. */ + struct xfile *xfile; + + /* Lock flags for @ip. */ uint ilock_flags; /* See the XCHK/XREP state flags below. */ @@ -106,6 +110,7 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c new file mode 100644 index 000000000000..cd91db4a5548 --- /dev/null +++ b/fs/xfs/scrub/stats.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_sysfs.h" +#include "xfs_btree.h" +#include "xfs_super.h" +#include "scrub/scrub.h" +#include "scrub/stats.h" +#include "scrub/trace.h" + +struct xchk_scrub_stats { + /* all 32-bit counters here */ + + /* checking stats */ + uint32_t invocations; + uint32_t clean; + uint32_t corrupt; + uint32_t preen; + uint32_t xfail; + uint32_t xcorrupt; + uint32_t incomplete; + uint32_t warning; + uint32_t retries; + + /* repair stats */ + uint32_t repair_invocations; + uint32_t repair_success; + + /* all 64-bit items here */ + + /* runtimes */ + uint64_t checktime_us; + uint64_t repairtime_us; + + /* non-counter state must go at the end for clearall */ + spinlock_t css_lock; +}; + +struct xchk_stats { + struct dentry *cs_debugfs; + struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR]; +}; + + +static struct xchk_stats global_stats; + +static const char *name_map[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = "sb", + [XFS_SCRUB_TYPE_AGF] = "agf", + [XFS_SCRUB_TYPE_AGFL] = "agfl", + [XFS_SCRUB_TYPE_AGI] = "agi", + [XFS_SCRUB_TYPE_BNOBT] = "bnobt", + [XFS_SCRUB_TYPE_CNTBT] = "cntbt", + [XFS_SCRUB_TYPE_INOBT] = "inobt", + [XFS_SCRUB_TYPE_FINOBT] = "finobt", + [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt", + [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt", + [XFS_SCRUB_TYPE_INODE] = "inode", + [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd", + [XFS_SCRUB_TYPE_BMBTA] = "bmapbta", + [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc", + [XFS_SCRUB_TYPE_DIR] = "directory", + [XFS_SCRUB_TYPE_XATTR] = "xattr", + [XFS_SCRUB_TYPE_SYMLINK] = "symlink", + [XFS_SCRUB_TYPE_PARENT] = "parent", + [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap", + [XFS_SCRUB_TYPE_RTSUM] = "rtsummary", + [XFS_SCRUB_TYPE_UQUOTA] = "usrquota", + [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", + [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", + [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", +}; + +/* Format the scrub stats into a text buffer, similar to pcp style. */ +STATIC ssize_t +xchk_stats_format( + struct xchk_stats *cs, + char *buf, + size_t remaining) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + ssize_t copied = 0; + int ret = 0; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + ret = scnprintf(buf, remaining, + "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n", + name_map[i], + (unsigned int)css->invocations, + (unsigned int)css->clean, + (unsigned int)css->corrupt, + (unsigned int)css->preen, + (unsigned int)css->xfail, + (unsigned int)css->xcorrupt, + (unsigned int)css->incomplete, + (unsigned int)css->warning, + (unsigned int)css->retries, + (unsigned long long)css->checktime_us, + (unsigned int)css->repair_invocations, + (unsigned int)css->repair_success, + (unsigned long long)css->repairtime_us); + if (ret <= 0) + break; + + remaining -= ret; + copied += ret; + buf += ret; + } + + return copied > 0 ? copied : ret; +} + +/* Estimate the worst case buffer size required to hold the whole report. */ +STATIC size_t +xchk_stats_estimate_bufsize( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + size_t field_width; + size_t ret = 0; + + /* 4294967296 plus one space for each u32 field */ + field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) / + sizeof(uint32_t)); + + /* 18446744073709551615 plus one space for each u64 field */ + field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) - + offsetof(struct xchk_scrub_stats, checktime_us)) / + sizeof(uint64_t)); + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + /* name plus one space */ + ret += 1 + strlen(name_map[i]); + + /* all fields, plus newline */ + ret += field_width + 1; + } + + return ret; +} + +/* Clear all counters. */ +STATIC void +xchk_stats_clearall( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + spin_lock(&css->css_lock); + memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock)); + spin_unlock(&css->css_lock); + } +} + +#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \ + XFS_SCRUB_OFLAG_PREEN | \ + XFS_SCRUB_OFLAG_XFAIL | \ + XFS_SCRUB_OFLAG_XCORRUPT | \ + XFS_SCRUB_OFLAG_INCOMPLETE | \ + XFS_SCRUB_OFLAG_WARNING) + +STATIC void +xchk_stats_merge_one( + struct xchk_stats *cs, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + struct xchk_scrub_stats *css; + + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) { + ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + return; + } + + css = &cs->cs_stats[sm->sm_type]; + spin_lock(&css->css_lock); + css->invocations++; + if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN)) + css->clean++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + css->corrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + css->preen++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL) + css->xfail++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT) + css->xcorrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + css->incomplete++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING) + css->warning++; + css->retries += run->retries; + css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC); + + if (run->repair_attempted) + css->repair_invocations++; + if (run->repair_succeeded) + css->repair_success++; + css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC); + spin_unlock(&css->css_lock); +} + +/* Merge these scrub-run stats into the global and mount stat data. */ +void +xchk_stats_merge( + struct xfs_mount *mp, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + xchk_stats_merge_one(&global_stats, sm, run); + xchk_stats_merge_one(mp->m_scrub_stats, sm, run); +} + +/* debugfs boilerplate */ + +static ssize_t +xchk_scrub_stats_read( + struct file *file, + char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + char *buf; + size_t bufsize; + ssize_t avail, ret; + + /* + * This generates stringly snapshot of all the scrub counters, so we + * do not want userspace to receive garbled text from multiple calls. + * If the file position is greater than 0, return a short read. + */ + if (*ppos > 0) + return 0; + + bufsize = xchk_stats_estimate_bufsize(cs); + + buf = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!buf) + return -ENOMEM; + + avail = xchk_stats_format(cs, buf, bufsize); + if (avail < 0) { + ret = avail; + goto out; + } + + ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail); +out: + kvfree(buf); + return ret; +} + +static const struct file_operations scrub_stats_fops = { + .open = simple_open, + .read = xchk_scrub_stats_read, +}; + +static ssize_t +xchk_clear_scrub_stats_write( + struct file *file, + const char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + unsigned int val; + int ret; + + ret = kstrtouint_from_user(ubuf, count, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + + xchk_stats_clearall(cs); + return count; +} + +static const struct file_operations clear_scrub_stats_fops = { + .open = simple_open, + .write = xchk_clear_scrub_stats_write, +}; + +/* Initialize the stats object. */ +STATIC int +xchk_stats_init( + struct xchk_stats *cs, + struct xfs_mount *mp) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) + spin_lock_init(&css->css_lock); + + return 0; +} + +/* Connect the stats object to debugfs. */ +void +xchk_stats_register( + struct xchk_stats *cs, + struct dentry *parent) +{ + if (!parent) + return; + + cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent); + if (!cs->cs_debugfs) + return; + + debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + &scrub_stats_fops); + debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + &clear_scrub_stats_fops); +} + +/* Free all resources related to the stats object. */ +STATIC int +xchk_stats_teardown( + struct xchk_stats *cs) +{ + return 0; +} + +/* Disconnect the stats object from debugfs. */ +void +xchk_stats_unregister( + struct xchk_stats *cs) +{ + debugfs_remove(cs->cs_debugfs); +} + +/* Initialize global stats and register them */ +int __init +xchk_global_stats_setup( + struct dentry *parent) +{ + int error; + + error = xchk_stats_init(&global_stats, NULL); + if (error) + return error; + + xchk_stats_register(&global_stats, parent); + return 0; +} + +/* Unregister global stats and tear them down */ +void +xchk_global_stats_teardown(void) +{ + xchk_stats_unregister(&global_stats); + xchk_stats_teardown(&global_stats); +} + +/* Allocate per-mount stats */ +int +xchk_mount_stats_alloc( + struct xfs_mount *mp) +{ + struct xchk_stats *cs; + int error; + + cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL); + if (!cs) + return -ENOMEM; + + error = xchk_stats_init(cs, mp); + if (error) + goto out_free; + + mp->m_scrub_stats = cs; + return 0; +out_free: + kvfree(cs); + return error; +} + +/* Free per-mount stats */ +void +xchk_mount_stats_free( + struct xfs_mount *mp) +{ + xchk_stats_teardown(mp->m_scrub_stats); + kvfree(mp->m_scrub_stats); + mp->m_scrub_stats = NULL; +} diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h new file mode 100644 index 000000000000..b358ad8d8b90 --- /dev/null +++ b/fs/xfs/scrub/stats.h @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_STATS_H__ +#define __XFS_SCRUB_STATS_H__ + +struct xchk_stats_run { + u64 scrub_ns; + u64 repair_ns; + unsigned int retries; + bool repair_attempted; + bool repair_succeeded; +}; + +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS +struct xchk_stats; + +int __init xchk_global_stats_setup(struct dentry *parent); +void xchk_global_stats_teardown(void); + +int xchk_mount_stats_alloc(struct xfs_mount *mp); +void xchk_mount_stats_free(struct xfs_mount *mp); + +void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent); +void xchk_stats_unregister(struct xchk_stats *cs); + +void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run); + +static inline u64 xchk_stats_now(void) { return ktime_get_ns(); } +static inline u64 xchk_stats_elapsed_ns(u64 since) +{ + u64 now = xchk_stats_now(); + + /* + * If the system doesn't have a high enough resolution clock, charge at + * least one nanosecond so that our stats don't report instantaneous + * runtimes. + */ + if (now == since) + return 1; + + return now - since; +} +#else +# define xchk_global_stats_setup(parent) (0) +# define xchk_global_stats_teardown() ((void)0) +# define xchk_mount_stats_alloc(mp) (0) +# define xchk_mount_stats_free(mp) ((void)0) +# define xchk_stats_register(cs, parent) ((void)0) +# define xchk_stats_unregister(cs) ((void)0) +# define xchk_stats_now() (0) +# define xchk_stats_elapsed_ns(x) (0 * (x)) +# define xchk_stats_merge(mp, sm, run) ((void)0) +#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */ + +#endif /* __XFS_SCRUB_STATS_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 0a975439d2b6..46249e7b17e0 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -12,8 +12,10 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "scrub/scrub.h" #include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b3894daeb86a..cbd4d01e253c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -16,6 +16,10 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +struct xfile; +struct xfarray; +struct xfarray_sortinfo; + /* * ftrace's __print_symbolic requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace @@ -94,10 +98,12 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \ { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \ { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ - { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } + { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \ + { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" } #define XFS_SCRUB_STATE_STRINGS \ { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -635,6 +641,28 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->cluster_ino) ) +TRACE_EVENT(xchk_inode_is_allocated, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned long, iflags) + __field(umode_t, mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; + __entry->mode = VFS_I(ip)->i_mode; + ), + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->iflags, + __entry->mode) +); + TRACE_EVENT(xchk_fscounters_calc, TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, uint64_t fdblocks, uint64_t delalloc), @@ -693,6 +721,31 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +DECLARE_EVENT_CLASS(xchk_fsfreeze_class, + TP_PROTO(struct xfs_scrub *sc, int error), + TP_ARGS(sc, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + ), + TP_printk("dev %d:%d type %s error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error) +); +#define DEFINE_XCHK_FSFREEZE_EVENT(name) \ +DEFINE_EVENT(xchk_fsfreeze_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int error), \ + TP_ARGS(sc, error)) +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); + TRACE_EVENT(xchk_refcount_incorrect, TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, xfs_nlink_t seen), @@ -725,13 +778,302 @@ TRACE_EVENT(xchk_refcount_incorrect, __entry->seen) ) +TRACE_EVENT(xfile_create, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + + __entry->ino = file_inode(xf->file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(xf->file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("xfino 0x%lx path '%s'", + __entry->ino, + __entry->pathname) +); + +TRACE_EVENT(xfile_destroy, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes, + __entry->size) +); + +DECLARE_EVENT_CLASS(xfile_class, + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), + TP_ARGS(xf, pos, bytecount), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes_used) + __field(loff_t, pos) + __field(loff_t, size) + __field(unsigned long long, bytecount) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes_used = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes_used = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + __entry->pos = pos; + __entry->bytecount = bytecount; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes_used, + __entry->pos, + __entry->bytecount, + __entry->size) +); +#define DEFINE_XFILE_EVENT(name) \ +DEFINE_EVENT(xfile_class, name, \ + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ + TP_ARGS(xf, pos, bytecount)) +DEFINE_XFILE_EVENT(xfile_pread); +DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_seek_data); +DEFINE_XFILE_EVENT(xfile_get_page); +DEFINE_XFILE_EVENT(xfile_put_page); + +TRACE_EVENT(xfarray_create, + TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), + TP_ARGS(xfa, required_capacity), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(uint64_t, max_nr) + __field(size_t, obj_size) + __field(int, obj_size_log) + __field(unsigned long long, required_capacity) + ), + TP_fast_assign( + __entry->max_nr = xfa->max_nr; + __entry->obj_size = xfa->obj_size; + __entry->obj_size_log = xfa->obj_size_log; + __entry->ino = file_inode(xfa->xfile->file)->i_ino; + __entry->required_capacity = required_capacity; + ), + TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d", + __entry->ino, + __entry->max_nr, + __entry->required_capacity, + __entry->obj_size, + __entry->obj_size_log) +); + +TRACE_EVENT(xfarray_isort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_pagesort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_qsort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + __field(int, stack_depth) + __field(int, max_stack_depth) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + __entry->stack_depth = si->stack_depth; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo, + __entry->stack_depth, + __entry->max_stack_depth) +); + +TRACE_EVENT(xfarray_sort, + TP_PROTO(struct xfarray_sortinfo *si, size_t bytes), + TP_ARGS(si, bytes), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(size_t, bytes) + __field(unsigned int, max_stack_depth) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->bytes = bytes; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->max_stack_depth, + __entry->bytes) +); + +TRACE_EVENT(xfarray_sort_stats, + TP_PROTO(struct xfarray_sortinfo *si, int error), + TP_ARGS(si, error), + TP_STRUCT__entry( + __field(unsigned long, ino) +#ifdef DEBUG + __field(unsigned long long, loads) + __field(unsigned long long, stores) + __field(unsigned long long, compares) + __field(unsigned long long, heapsorts) +#endif + __field(unsigned int, max_stack_depth) + __field(unsigned int, max_stack_used) + __field(int, error) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; +#ifdef DEBUG + __entry->loads = si->loads; + __entry->stores = si->stores; + __entry->compares = si->compares; + __entry->heapsorts = si->heapsorts; +#endif + __entry->max_stack_depth = si->max_stack_depth; + __entry->max_stack_used = si->max_stack_used; + __entry->error = error; + ), + TP_printk( +#ifdef DEBUG + "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d", +#else + "xfino 0x%lx stack_depth %u/%u error %d", +#endif + __entry->ino, +#ifdef DEBUG + __entry->loads, + __entry->stores, + __entry->compares, + __entry->heapsorts, +#endif + __entry->max_stack_used, + __entry->max_stack_depth, + __entry->error) +); + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xchk_rtsum_record_free, + TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start, + uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v), + TP_ARGS(mp, start, len, log, pos, v), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtblock_t, start) + __field(unsigned long long, len) + __field(unsigned int, log) + __field(loff_t, pos) + __field(xfs_suminfo_t, v) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->len = len; + __entry->log = log; + __entry->pos = pos; + __entry->v = v; + ), + TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->len, + __entry->log, + __entry->pos, + __entry->v) +); +#endif /* CONFIG_XFS_RT */ + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(pag, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -739,8 +1081,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class, __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; __entry->agbno = agbno; __entry->len = len; ), @@ -752,12 +1094,45 @@ DECLARE_EVENT_CLASS(xrep_extent_class, ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) -DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent); + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(pag, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_find_class, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, + bool crosslinked), + TP_ARGS(pag, agbno, len, crosslinked), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(bool, crosslinked) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->crosslinked = crosslinked; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->crosslinked ? 1 : 0) +); +#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ +DEFINE_EVENT(xrep_reap_find_class, name, \ + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ + bool crosslinked), \ + TP_ARGS(pag, agbno, len, crosslinked)) +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); + DECLARE_EVENT_CLASS(xrep_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, @@ -827,28 +1202,6 @@ TRACE_EVENT(xrep_refcount_extent_fn, __entry->refcount) ) -TRACE_EVENT(xrep_init_btblock, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_btnum_t btnum), - TP_ARGS(mp, agno, agbno, btnum), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) - __field(uint32_t, btnum) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->btnum = btnum; - ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, - __entry->agbno, - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS)) -) TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c new file mode 100644 index 000000000000..f0f532c10a5a --- /dev/null +++ b/fs/xfs/scrub/xfarray.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" + +/* + * Large Arrays of Fixed-Size Records + * ================================== + * + * This memory array uses an xfile (which itself is a memfd "file") to store + * large numbers of fixed-size records in memory that can be paged out. This + * puts less stress on the memory reclaim algorithms during an online repair + * because we don't have to pin so much memory. However, array access is less + * direct than would be in a regular memory array. Access to the array is + * performed via indexed load and store methods, and an append method is + * provided for convenience. Array elements can be unset, which sets them to + * all zeroes. Unset entries are skipped during iteration, though direct loads + * will return a zeroed buffer. Callers are responsible for concurrency + * control. + */ + +/* + * Pointer to scratch space. Because we can't access the xfile data directly, + * we allocate a small amount of memory on the end of the xfarray structure to + * buffer array items when we need space to store values temporarily. + */ +static inline void *xfarray_scratch(struct xfarray *array) +{ + return (array + 1); +} + +/* Compute array index given an xfile offset. */ +static xfarray_idx_t +xfarray_idx( + struct xfarray *array, + loff_t pos) +{ + if (array->obj_size_log >= 0) + return (xfarray_idx_t)pos >> array->obj_size_log; + + return div_u64((xfarray_idx_t)pos, array->obj_size); +} + +/* Compute xfile offset of array element. */ +static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx) +{ + if (array->obj_size_log >= 0) + return idx << array->obj_size_log; + + return idx * array->obj_size; +} + +/* + * Initialize a big memory array. Array records cannot be larger than a + * page, and the array cannot span more bytes than the page cache supports. + * If @required_capacity is nonzero, the maximum array size will be set to this + * quantity and the array creation will fail if the underlying storage cannot + * support that many records. + */ +int +xfarray_create( + const char *description, + unsigned long long required_capacity, + size_t obj_size, + struct xfarray **arrayp) +{ + struct xfarray *array; + struct xfile *xfile; + int error; + + ASSERT(obj_size < PAGE_SIZE); + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + error = -ENOMEM; + array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS); + if (!array) + goto out_xfile; + + array->xfile = xfile; + array->obj_size = obj_size; + + if (is_power_of_2(obj_size)) + array->obj_size_log = ilog2(obj_size); + else + array->obj_size_log = -1; + + array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE); + trace_xfarray_create(array, required_capacity); + + if (required_capacity > 0) { + if (array->max_nr < required_capacity) { + error = -ENOMEM; + goto out_xfarray; + } + array->max_nr = required_capacity; + } + + *arrayp = array; + return 0; + +out_xfarray: + kfree(array); +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy the array. */ +void +xfarray_destroy( + struct xfarray *array) +{ + xfile_destroy(array->xfile); + kfree(array); +} + +/* Load an element from the array. */ +int +xfarray_load( + struct xfarray *array, + xfarray_idx_t idx, + void *ptr) +{ + if (idx >= array->nr) + return -ENODATA; + + return xfile_obj_load(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); +} + +/* Is this array element potentially unset? */ +static inline bool +xfarray_is_unset( + struct xfarray *array, + loff_t pos) +{ + void *temp = xfarray_scratch(array); + int error; + + if (array->unset_slots == 0) + return false; + + error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + if (!error && xfarray_element_is_null(array, temp)) + return true; + + return false; +} + +/* + * Unset an array element. If @idx is the last element in the array, the + * array will be truncated. Otherwise, the entry will be zeroed. + */ +int +xfarray_unset( + struct xfarray *array, + xfarray_idx_t idx) +{ + void *temp = xfarray_scratch(array); + loff_t pos = xfarray_pos(array, idx); + int error; + + if (idx >= array->nr) + return -ENODATA; + + if (idx == array->nr - 1) { + array->nr--; + return 0; + } + + if (xfarray_is_unset(array, pos)) + return 0; + + memset(temp, 0, array->obj_size); + error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + if (error) + return error; + + array->unset_slots++; + return 0; +} + +/* + * Store an element in the array. The element must not be completely zeroed, + * because those are considered unset sparse elements. + */ +int +xfarray_store( + struct xfarray *array, + xfarray_idx_t idx, + const void *ptr) +{ + int ret; + + if (idx >= array->max_nr) + return -EFBIG; + + ASSERT(!xfarray_element_is_null(array, ptr)); + + ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); + if (ret) + return ret; + + array->nr = max(array->nr, idx + 1); + return 0; +} + +/* Is this array element NULL? */ +bool +xfarray_element_is_null( + struct xfarray *array, + const void *ptr) +{ + return !memchr_inv(ptr, 0, array->obj_size); +} + +/* + * Store an element anywhere in the array that is unset. If there are no + * unset slots, append the element to the array. + */ +int +xfarray_store_anywhere( + struct xfarray *array, + const void *ptr) +{ + void *temp = xfarray_scratch(array); + loff_t endpos = xfarray_pos(array, array->nr); + loff_t pos; + int error; + + /* Find an unset slot to put it in. */ + for (pos = 0; + pos < endpos && array->unset_slots > 0; + pos += array->obj_size) { + error = xfile_obj_load(array->xfile, temp, array->obj_size, + pos); + if (error || !xfarray_element_is_null(array, temp)) + continue; + + error = xfile_obj_store(array->xfile, ptr, array->obj_size, + pos); + if (error) + return error; + + array->unset_slots--; + return 0; + } + + /* No unset slots found; attach it on the end. */ + array->unset_slots = 0; + return xfarray_append(array, ptr); +} + +/* Return length of array. */ +uint64_t +xfarray_length( + struct xfarray *array) +{ + return array->nr; +} + +/* + * Decide which array item we're going to read as part of an _iter_get. + * @cur is the array index, and @pos is the file offset of that array index in + * the backing xfile. Returns ENODATA if we reach the end of the records. + * + * Reading from a hole in a sparse xfile causes page instantiation, so for + * iterating a (possibly sparse) array we need to figure out if the cursor is + * pointing at a totally uninitialized hole and move the cursor up if + * necessary. + */ +static inline int +xfarray_find_data( + struct xfarray *array, + xfarray_idx_t *cur, + loff_t *pos) +{ + unsigned int pgoff = offset_in_page(*pos); + loff_t end_pos = *pos + array->obj_size - 1; + loff_t new_pos; + + /* + * If the current array record is not adjacent to a page boundary, we + * are in the middle of the page. We do not need to move the cursor. + */ + if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE) + return 0; + + /* + * Call SEEK_DATA on the last byte in the record we're about to read. + * If the record ends at (or crosses) the end of a page then we know + * that the first byte of the record is backed by pages and don't need + * to query it. If instead the record begins at the start of the page + * then we know that querying the last byte is just as good as querying + * the first byte, since records cannot be larger than a page. + * + * If the call returns the same file offset, we know this record is + * backed by real pages. We do not need to move the cursor. + */ + new_pos = xfile_seek_data(array->xfile, end_pos); + if (new_pos == -ENXIO) + return -ENODATA; + if (new_pos < 0) + return new_pos; + if (new_pos == end_pos) + return 0; + + /* + * Otherwise, SEEK_DATA told us how far up to move the file pointer to + * find more data. Move the array index to the first record past the + * byte offset we were given. + */ + new_pos = roundup_64(new_pos, array->obj_size); + *cur = xfarray_idx(array, new_pos); + *pos = xfarray_pos(array, *cur); + return 0; +} + +/* + * Starting at *idx, fetch the next non-null array entry and advance the index + * to set up the next _load_next call. Returns ENODATA if we reach the end of + * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first + * call to this function. + */ +int +xfarray_load_next( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + xfarray_idx_t cur = *idx; + loff_t pos = xfarray_pos(array, cur); + int error; + + do { + if (cur >= array->nr) + return -ENODATA; + + /* + * Ask the backing store for the location of next possible + * written record, then retrieve that record. + */ + error = xfarray_find_data(array, &cur, &pos); + if (error) + return error; + error = xfarray_load(array, cur, rec); + if (error) + return error; + + cur++; + pos += array->obj_size; + } while (xfarray_element_is_null(array, rec)); + + *idx = cur; + return 0; +} + +/* Sorting functions */ + +#ifdef DEBUG +# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0) +# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0) +# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0) +# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0) +#else +# define xfarray_sort_bump_loads(si) +# define xfarray_sort_bump_stores(si) +# define xfarray_sort_bump_compares(si) +# define xfarray_sort_bump_heapsorts(si) +#endif /* DEBUG */ + +/* Load an array element for sorting. */ +static inline int +xfarray_sort_load( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_loads(si); + return xfarray_load(si->array, idx, ptr); +} + +/* Store an array element for sorting. */ +static inline int +xfarray_sort_store( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_stores(si); + return xfarray_store(si->array, idx, ptr); +} + +/* Compare an array element for sorting. */ +static inline int +xfarray_sort_cmp( + struct xfarray_sortinfo *si, + const void *a, + const void *b) +{ + xfarray_sort_bump_compares(si); + return si->cmp_fn(a, b); +} + +/* Return a pointer to the low index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si) +{ + return (xfarray_idx_t *)(si + 1); +} + +/* Return a pointer to the high index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_lo(si) + si->max_stack_depth; +} + +/* Size of each element in the quicksort pivot array. */ +static inline size_t +xfarray_pivot_rec_sz( + struct xfarray *array) +{ + return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t); +} + +/* Allocate memory to handle the sort. */ +static inline int +xfarray_sortinfo_alloc( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags, + struct xfarray_sortinfo **infop) +{ + struct xfarray_sortinfo *si; + size_t nr_bytes = sizeof(struct xfarray_sortinfo); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(array); + int max_stack_depth; + + /* + * The median-of-nine pivot algorithm doesn't work if a subset has + * fewer than 9 items. Make sure the in-memory sort will always take + * over for subsets where this wouldn't be the case. + */ + BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR); + + /* + * Tail-call recursion during the partitioning phase means that + * quicksort will never recurse more than log2(nr) times. We need one + * extra level of stack to hold the initial parameters. In-memory + * sort will always take care of the last few levels of recursion for + * us, so we can reduce the stack depth by that much. + */ + max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1); + if (max_stack_depth < 1) + max_stack_depth = 1; + + /* Each level of quicksort uses a lo and a hi index */ + nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2; + + /* Scratchpad for in-memory sort, or finding the pivot */ + nr_bytes += max_t(size_t, + (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz, + XFARRAY_ISORT_NR * array->obj_size); + + si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS); + if (!si) + return -ENOMEM; + + si->array = array; + si->cmp_fn = cmp_fn; + si->flags = flags; + si->max_stack_depth = max_stack_depth; + si->max_stack_used = 1; + + xfarray_sortinfo_lo(si)[0] = 0; + xfarray_sortinfo_hi(si)[0] = array->nr - 1; + + trace_xfarray_sort(si, nr_bytes); + *infop = si; + return 0; +} + +/* Should this sort be terminated by a fatal signal? */ +static inline bool +xfarray_sort_terminated( + struct xfarray_sortinfo *si, + int *error) +{ + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. + */ + cond_resched(); + + if ((si->flags & XFARRAY_SORT_KILLABLE) && + fatal_signal_pending(current)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +/* Do we want an in-memory sort? */ +static inline bool +xfarray_want_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t start, + xfarray_idx_t end) +{ + /* + * For array subsets that fit in the scratchpad, it's much faster to + * use the kernel's heapsort than quicksort's stack machine. + */ + return (end - start) < XFARRAY_ISORT_NR; +} + +/* Return the scratch space within the sortinfo structure. */ +static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* + * Sort a small number of array records using scratchpad memory. The records + * need not be contiguous in the xfile's memory pages. + */ +STATIC int +xfarray_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *scratch = xfarray_sortinfo_isort_scratch(si); + loff_t lo_pos = xfarray_pos(si->array, lo); + loff_t len = xfarray_pos(si->array, hi - lo + 1); + int error; + + trace_xfarray_isort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); +} + +/* Grab a page for sorting records. */ +static inline int +xfarray_sort_get_page( + struct xfarray_sortinfo *si, + loff_t pos, + uint64_t len) +{ + int error; + + error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); + if (error) + return error; + + /* + * xfile pages must never be mapped into userspace, so we skip the + * dcache flush when mapping the page. + */ + si->page_kaddr = kmap_local_page(si->xfpage.page); + return 0; +} + +/* Release a page we grabbed for sorting records. */ +static inline int +xfarray_sort_put_page( + struct xfarray_sortinfo *si) +{ + if (!si->page_kaddr) + return 0; + + kunmap_local(si->page_kaddr); + si->page_kaddr = NULL; + + return xfile_put_page(si->array->xfile, &si->xfpage); +} + +/* Decide if these records are eligible for in-page sorting. */ +static inline bool +xfarray_want_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + pgoff_t lo_page; + pgoff_t hi_page; + loff_t end_pos; + + /* We can only map one page at a time. */ + lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; + end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; + hi_page = end_pos >> PAGE_SHIFT; + + return lo_page == hi_page; +} + +/* Sort a bunch of records that all live in the same memory page. */ +STATIC int +xfarray_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *startp; + loff_t lo_pos = xfarray_pos(si->array, lo); + uint64_t len = xfarray_pos(si->array, hi - lo); + int error = 0; + + trace_xfarray_pagesort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfarray_sort_get_page(si, lo_pos, len); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + startp = si->page_kaddr + offset_in_page(lo_pos); + sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfarray_sort_put_page(si); +} + +/* Return a pointer to the xfarray pivot record within the sortinfo struct. */ +static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* Return a pointer to the start of the pivot array. */ +static inline void * +xfarray_sortinfo_pivot_array( + struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_pivot(si) + si->array->obj_size; +} + +/* The xfarray record is stored at the start of each pivot array element. */ +static inline void * +xfarray_pivot_array_rec( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return pa + (pa_recsz * pa_idx); +} + +/* The xfarray index is stored at the end of each pivot array element. */ +static inline xfarray_idx_t * +xfarray_pivot_array_idx( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) - + sizeof(xfarray_idx_t); +} + +/* + * Find a pivot value for quicksort partitioning, swap it with a[lo], and save + * the cached pivot record for the next step. + * + * Load evenly-spaced records within the given range into memory, sort them, + * and choose the pivot from the median record. Using multiple points will + * improve the quality of the pivot selection, and hopefully avoid the worst + * quicksort behavior, since our array values are nearly always evenly sorted. + */ +STATIC int +xfarray_qsort_pivot( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *pivot = xfarray_sortinfo_pivot(si); + void *parray = xfarray_sortinfo_pivot_array(si); + void *recp; + xfarray_idx_t *idxp; + xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array); + int i, j; + int error; + + ASSERT(step > 0); + + /* + * Load the xfarray indexes of the records we intend to sample into the + * pivot array. + */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0); + *idxp = lo; + for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + *idxp = lo + (i * step); + } + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR - 1); + *idxp = hi; + + /* Load the selected xfarray records into the pivot array. */ + for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) { + xfarray_idx_t idx; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + + /* No unset records; load directly into the array. */ + if (likely(si->array->unset_slots == 0)) { + error = xfarray_sort_load(si, *idxp, recp); + if (error) + return error; + continue; + } + + /* + * Load non-null records into the scratchpad without changing + * the xfarray_idx_t in the pivot array. + */ + idx = *idxp; + xfarray_sort_bump_loads(si); + error = xfarray_load_next(si->array, &idx, recp); + if (error) + return error; + } + + xfarray_sort_bump_heapsorts(si); + sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL); + + /* + * We sorted the pivot array records (which includes the xfarray + * indices) in xfarray record order. The median element of the pivot + * array contains the xfarray record that we will use as the pivot. + * Copy that xfarray record to the designated space. + */ + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + memcpy(pivot, recp, si->array->obj_size); + + /* If the pivot record we chose was already in a[lo] then we're done. */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + if (*idxp == lo) + return 0; + + /* + * Find the cached copy of a[lo] in the pivot array so that we can swap + * a[lo] and a[pivot]. + */ + for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + if (*idxp == lo) + j = i; + } + if (j < 0) { + ASSERT(j >= 0); + return -EFSCORRUPTED; + } + + /* Swap a[lo] and a[pivot]. */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + return error; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + return xfarray_sort_store(si, *idxp, recp); +} + +/* + * Set up the pointers for the next iteration. We push onto the stack all of + * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the + * current stack frame to point to the unsorted values between a[beg[i]] and + * a[lo] so that those values will be sorted when we pop the stack. + */ +static inline int +xfarray_qsort_push( + struct xfarray_sortinfo *si, + xfarray_idx_t *si_lo, + xfarray_idx_t *si_hi, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + /* Check for stack overflows */ + if (si->stack_depth >= si->max_stack_depth - 1) { + ASSERT(si->stack_depth < si->max_stack_depth - 1); + return -EFSCORRUPTED; + } + + si->max_stack_used = max_t(uint8_t, si->max_stack_used, + si->stack_depth + 2); + + si_lo[si->stack_depth + 1] = lo + 1; + si_hi[si->stack_depth + 1] = si_hi[si->stack_depth]; + si_hi[si->stack_depth++] = lo - 1; + + /* + * Always start with the smaller of the two partitions to keep the + * amount of recursion in check. + */ + if (si_hi[si->stack_depth] - si_lo[si->stack_depth] > + si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) { + swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]); + swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]); + } + + return 0; +} + +/* + * Load an element from the array into the first scratchpad and cache the page, + * if possible. + */ +static inline int +xfarray_sort_load_cached( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + loff_t idx_pos = xfarray_pos(si->array, idx); + pgoff_t startpage; + pgoff_t endpage; + int error = 0; + + /* + * If this load would split a page, release the cached page, if any, + * and perform a traditional read. + */ + startpage = idx_pos >> PAGE_SHIFT; + endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; + if (startpage != endpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + + if (xfarray_sort_terminated(si, &error)) + return error; + + return xfile_obj_load(si->array->xfile, ptr, + si->array->obj_size, idx_pos); + } + + /* If the cached page is not the one we want, release it. */ + if (xfile_page_cached(&si->xfpage) && + xfile_page_index(&si->xfpage) != startpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + } + + /* + * If we don't have a cached page (and we know the load is contained + * in a single page) then grab it. + */ + if (!xfile_page_cached(&si->xfpage)) { + if (xfarray_sort_terminated(si, &error)) + return error; + + error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, + PAGE_SIZE); + if (error) + return error; + } + + memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), + si->array->obj_size); + return 0; +} + +/* + * Sort the array elements via quicksort. This implementation incorporates + * four optimizations discussed in Sedgewick: + * + * 1. Use an explicit stack of array indices to store the next array partition + * to sort. This helps us to avoid recursion in the call stack, which is + * particularly expensive in the kernel. + * + * 2. For arrays with records in arbitrary or user-controlled order, choose the + * pivot element using a median-of-nine decision tree. This reduces the + * probability of selecting a bad pivot value which causes worst case + * behavior (i.e. partition sizes of 1). + * + * 3. The smaller of the two sub-partitions is pushed onto the stack to start + * the next level of recursion, and the larger sub-partition replaces the + * current stack frame. This guarantees that we won't need more than + * log2(nr) stack space. + * + * 4. For small sets, load the records into the scratchpad and run heapsort on + * them because that is very fast. In the author's experience, this yields + * a ~10% reduction in runtime. + * + * If a small set is contained entirely within a single xfile memory page, + * map the page directly and run heap sort directly on the xfile page + * instead of using the load/store interface. This halves the runtime. + * + * 5. This optimization is specific to the implementation. When converging lo + * and hi after selecting a pivot, we will try to retain the xfile memory + * page between load calls, which reduces run time by 50%. + */ + +/* + * Due to the use of signed indices, we can only support up to 2^63 records. + * Files can only grow to 2^63 bytes, so this is not much of a limitation. + */ +#define QSORT_MAX_RECS (1ULL << 63) + +int +xfarray_sort( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags) +{ + struct xfarray_sortinfo *si; + xfarray_idx_t *si_lo, *si_hi; + void *pivot; + void *scratch = xfarray_scratch(array); + xfarray_idx_t lo, hi; + int error = 0; + + if (array->nr < 2) + return 0; + if (array->nr >= QSORT_MAX_RECS) + return -E2BIG; + + error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si); + if (error) + return error; + si_lo = xfarray_sortinfo_lo(si); + si_hi = xfarray_sortinfo_hi(si); + pivot = xfarray_sortinfo_pivot(si); + + while (si->stack_depth >= 0) { + lo = si_lo[si->stack_depth]; + hi = si_hi[si->stack_depth]; + + trace_xfarray_qsort(si, lo, hi); + + /* Nothing left in this partition to sort; pop stack. */ + if (lo >= hi) { + si->stack_depth--; + continue; + } + + /* + * If directly mapping the page and sorting can solve our + * problems, we're done. + */ + if (xfarray_want_pagesort(si, lo, hi)) { + error = xfarray_pagesort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* If insertion sort can solve our problems, we're done. */ + if (xfarray_want_isort(si, lo, hi)) { + error = xfarray_isort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* Pick a pivot, move it to a[lo] and stash it. */ + error = xfarray_qsort_pivot(si, lo, hi); + if (error) + goto out_free; + + /* + * Rearrange a[lo..hi] such that everything smaller than the + * pivot is on the left side of the range and everything larger + * than the pivot is on the right side of the range. + */ + while (lo < hi) { + /* + * Decrement hi until it finds an a[hi] less than the + * pivot value. + */ + error = xfarray_sort_load_cached(si, hi, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && + lo < hi) { + hi--; + error = xfarray_sort_load_cached(si, hi, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[hi]) to a[lo]. */ + if (lo < hi) { + error = xfarray_sort_store(si, lo++, scratch); + if (error) + goto out_free; + } + + /* + * Increment lo until it finds an a[lo] greater than + * the pivot value. + */ + error = xfarray_sort_load_cached(si, lo, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && + lo < hi) { + lo++; + error = xfarray_sort_load_cached(si, lo, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[lo]) to a[hi]. */ + if (lo < hi) { + error = xfarray_sort_store(si, hi--, scratch); + if (error) + goto out_free; + } + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + + /* + * Put our pivot value in the correct place at a[lo]. All + * values between a[beg[i]] and a[lo - 1] should be less than + * the pivot; and all values between a[lo + 1] and a[end[i]-1] + * should be greater than the pivot. + */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + goto out_free; + + /* Set up the stack frame to process the two partitions. */ + error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + +out_free: + trace_xfarray_sort_stats(si, error); + kvfree(si); + return error; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h new file mode 100644 index 000000000000..4ecac01363d9 --- /dev/null +++ b/fs/xfs/scrub/xfarray.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFARRAY_H__ +#define __XFS_SCRUB_XFARRAY_H__ + +/* xfile array index type, along with cursor initialization */ +typedef uint64_t xfarray_idx_t; +#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) + +/* Iterate each index of an xfile array. */ +#define foreach_xfarray_idx(array, idx) \ + for ((idx) = XFARRAY_CURSOR_INIT; \ + (idx) < xfarray_length(array); \ + (idx)++) + +struct xfarray { + /* Underlying file that backs the array. */ + struct xfile *xfile; + + /* Number of array elements. */ + xfarray_idx_t nr; + + /* Maximum possible array size. */ + xfarray_idx_t max_nr; + + /* Number of unset slots in the array below @nr. */ + uint64_t unset_slots; + + /* Size of an array element. */ + size_t obj_size; + + /* log2 of array element size, if possible. */ + int obj_size_log; +}; + +int xfarray_create(const char *descr, unsigned long long required_capacity, + size_t obj_size, struct xfarray **arrayp); +void xfarray_destroy(struct xfarray *array); +int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr); +int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); +int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); +int xfarray_store_anywhere(struct xfarray *array, const void *ptr); +bool xfarray_element_is_null(struct xfarray *array, const void *ptr); + +/* Append an element to the array. */ +static inline int xfarray_append(struct xfarray *array, const void *ptr) +{ + return xfarray_store(array, array->nr, ptr); +} + +uint64_t xfarray_length(struct xfarray *array); +int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); + +/* Declarations for xfile array sort functionality. */ + +typedef cmp_func_t xfarray_cmp_fn; + +/* Perform an in-memory heapsort for small subsets. */ +#define XFARRAY_ISORT_SHIFT (4) +#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT) + +/* Evalulate this many points to find the qsort pivot. */ +#define XFARRAY_QSORT_PIVOT_NR (9) + +struct xfarray_sortinfo { + struct xfarray *array; + + /* Comparison function for the sort. */ + xfarray_cmp_fn cmp_fn; + + /* Maximum height of the partition stack. */ + uint8_t max_stack_depth; + + /* Current height of the partition stack. */ + int8_t stack_depth; + + /* Maximum stack depth ever used. */ + uint8_t max_stack_used; + + /* XFARRAY_SORT_* flags; see below. */ + unsigned int flags; + + /* Cache a page here for faster access. */ + struct xfile_page xfpage; + void *page_kaddr; + +#ifdef DEBUG + /* Performance statistics. */ + uint64_t loads; + uint64_t stores; + uint64_t compares; + uint64_t heapsorts; +#endif + /* + * Extra bytes are allocated beyond the end of the structure to store + * quicksort information. C does not permit multiple VLAs per struct, + * so we document all of this in a comment. + * + * Pretend that we have a typedef for array records: + * + * typedef char[array->obj_size] xfarray_rec_t; + * + * First comes the quicksort partition stack: + * + * xfarray_idx_t lo[max_stack_depth]; + * xfarray_idx_t hi[max_stack_depth]; + * + * union { + * + * If for a given subset we decide to use an in-memory sort, we use a + * block of scratchpad records here to compare items: + * + * xfarray_rec_t scratch[ISORT_NR]; + * + * Otherwise, we want to partition the records to partition the array. + * We store the chosen pivot record at the start of the scratchpad area + * and use the rest to sample some records to estimate the median. + * The format of the qsort_pivot array enables us to use the kernel + * heapsort function to place the median value in the middle. + * + * struct { + * xfarray_rec_t pivot; + * struct { + * xfarray_rec_t rec; (rounded up to 8 bytes) + * xfarray_idx_t idx; + * } qsort_pivot[QSORT_PIVOT_NR]; + * }; + * } + */ +}; + +/* Sort can be interrupted by a fatal signal. */ +#define XFARRAY_SORT_KILLABLE (1U << 0) + +int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn, + unsigned int flags); + +#endif /* __XFS_SCRUB_XFARRAY_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c new file mode 100644 index 000000000000..d98e8e77c684 --- /dev/null +++ b/fs/xfs/scrub/xfile.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_format.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" +#include <linux/shmem_fs.h> + +/* + * Swappable Temporary Memory + * ========================== + * + * Online checking sometimes needs to be able to stage a large amount of data + * in memory. This information might not fit in the available memory and it + * doesn't all need to be accessible at all times. In other words, we want an + * indexed data buffer to store data that can be paged out. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xfile must be freed with xfile_destroy. + * + * xfiles assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. Reads + * and writes are satisfied directly from the page cache. + * + * NOTE: The current shmemfs implementation has a quirk that in-kernel reads + * of a hole cause a page to be mapped into the file. If you are going to + * create a sparse xfile, please be careful about reading from uninitialized + * parts of the file. These pages are !Uptodate and will eventually be + * reclaimed if not written, but in the short term this boosts memory + * consumption. + */ + +/* + * xfiles must not be exposed to userspace and require upper layers to + * coordinate access to the one handle returned by the constructor, so + * establish a separate lock class for xfiles to avoid confusing lockdep. + */ +static struct lock_class_key xfile_i_mutex_key; + +/* + * Create an xfile of the given size. The description will be used in the + * trace output. + */ +int +xfile_create( + const char *description, + loff_t isize, + struct xfile **xfilep) +{ + struct inode *inode; + struct xfile *xf; + int error = -ENOMEM; + + xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); + if (!xf) + return -ENOMEM; + + xf->file = shmem_file_setup(description, isize, 0); + if (!xf->file) + goto out_xfile; + if (IS_ERR(xf->file)) { + error = PTR_ERR(xf->file); + goto out_xfile; + } + + /* + * We want a large sparse file that we can pread, pwrite, and seek. + * xfile users are responsible for keeping the xfile hidden away from + * all other callers, so we skip timestamp updates and security checks. + * Make the inode only accessible by root, just in case the xfile ever + * escapes. + */ + xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | + FMODE_LSEEK; + xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; + inode = file_inode(xf->file); + inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; + inode->i_mode &= ~0177; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + + lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + + trace_xfile_create(xf); + + *xfilep = xf; + return 0; +out_xfile: + kfree(xf); + return error; +} + +/* Close the file and release all resources. */ +void +xfile_destroy( + struct xfile *xf) +{ + struct inode *inode = file_inode(xf->file); + + trace_xfile_destroy(xf); + + lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key); + fput(xf->file); + kfree(xf); +} + +/* + * Read a memory object directly from the xfile's page cache. Unlike regular + * pread, we return -E2BIG and -EFBIG for reads that are too large or at too + * high an offset, instead of truncating the read. Otherwise, we return + * bytes read or an error code, like regular pread. + */ +ssize_t +xfile_pread( + struct xfile *xf, + void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + struct page *page = NULL; + ssize_t read = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pread(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *p, *kaddr; + unsigned int len; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * In-kernel reads of a shmem file cause it to allocate a page + * if the mapping shows a hole. Therefore, if we hit ENOMEM + * we can continue by zeroing the caller's buffer. + */ + page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, + __GFP_NOWARN); + if (IS_ERR(page)) { + error = PTR_ERR(page); + if (error != -ENOMEM) + break; + + memset(buf, 0, len); + goto advance; + } + + if (PageUptodate(page)) { + /* + * xfile pages must never be mapped into userspace, so + * we skip the dcache flush. + */ + kaddr = kmap_local_page(page); + p = kaddr + offset_in_page(pos); + memcpy(buf, p, len); + kunmap_local(kaddr); + } else { + memset(buf, 0, len); + } + put_page(page); + +advance: + count -= len; + pos += len; + buf += len; + read += len; + } + memalloc_nofs_restore(pflags); + + if (read > 0) + return read; + return error; +} + +/* + * Write a memory object directly to the xfile's page cache. Unlike regular + * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too + * high an offset, instead of truncating the write. Otherwise, we return + * bytes written or an error code, like regular pwrite. + */ +ssize_t +xfile_pwrite( + struct xfile *xf, + const void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + ssize_t written = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pwrite(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *fsdata = NULL; + void *p, *kaddr; + unsigned int len; + int ret; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. + * shmem doesn't support fs freeze, but lockdep doesn't know + * that and will trip over that. + */ + error = aops->write_begin(NULL, mapping, pos, len, &page, + &fsdata); + if (error) + break; + + /* + * xfile pages must never be mapped into userspace, so we skip + * the dcache flush. If the page is not uptodate, zero it + * before writing data. + */ + kaddr = kmap_local_page(page); + if (!PageUptodate(page)) { + memset(kaddr, 0, PAGE_SIZE); + SetPageUptodate(page); + } + p = kaddr + offset_in_page(pos); + memcpy(p, buf, len); + kunmap_local(kaddr); + + ret = aops->write_end(NULL, mapping, pos, len, len, page, + fsdata); + if (ret < 0) { + error = ret; + break; + } + + written += ret; + if (ret != len) + break; + + count -= ret; + pos += ret; + buf += ret; + } + memalloc_nofs_restore(pflags); + + if (written > 0) + return written; + return error; +} + +/* Find the next written area in the xfile data for a given offset. */ +loff_t +xfile_seek_data( + struct xfile *xf, + loff_t pos) +{ + loff_t ret; + + ret = vfs_llseek(xf->file, pos, SEEK_DATA); + trace_xfile_seek_data(xf, pos, ret); + return ret; +} + +/* Query stat information for an xfile. */ +int +xfile_stat( + struct xfile *xf, + struct xfile_stat *statbuf) +{ + struct kstat ks; + int error; + + error = vfs_getattr_nosec(&xf->file->f_path, &ks, + STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); + if (error) + return error; + + statbuf->size = ks.size; + statbuf->bytes = ks.blocks << SECTOR_SHIFT; + return 0; +} + +/* + * Grab the (locked) page for a memory object. The object cannot span a page + * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we + * cannot grab the page, or the usual negative errno. + */ +int +xfile_get_page( + struct xfile *xf, + loff_t pos, + unsigned int len, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + void *fsdata = NULL; + loff_t key = round_down(pos, PAGE_SIZE); + unsigned int pflags; + int error; + + if (inode->i_sb->s_maxbytes - pos < len) + return -ENOMEM; + if (len > PAGE_SIZE - offset_in_page(pos)) + return -ENOTBLK; + + trace_xfile_get_page(xf, pos, len); + + pflags = memalloc_nofs_save(); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. shmem + * doesn't support fs freeze, but lockdep doesn't know that and will + * trip over that. + */ + error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, + &fsdata); + if (error) + goto out_pflags; + + /* We got the page, so make sure we push out EOF. */ + if (i_size_read(inode) < pos + len) + i_size_write(inode, pos + len); + + /* + * If the page isn't up to date, fill it with zeroes before we hand it + * to the caller and make sure the backing store will hold on to them. + */ + if (!PageUptodate(page)) { + void *kaddr; + + kaddr = kmap_local_page(page); + memset(kaddr, 0, PAGE_SIZE); + kunmap_local(kaddr); + SetPageUptodate(page); + } + + /* + * Mark each page dirty so that the contents are written to some + * backing store when we drop this buffer, and take an extra reference + * to prevent the xfile page from being swapped or removed from the + * page cache by reclaim if the caller unlocks the page. + */ + set_page_dirty(page); + get_page(page); + + xfpage->page = page; + xfpage->fsdata = fsdata; + xfpage->pos = key; +out_pflags: + memalloc_nofs_restore(pflags); + return error; +} + +/* + * Release the (locked) page for a memory object. Returns 0 or a negative + * errno. + */ +int +xfile_put_page( + struct xfile *xf, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + unsigned int pflags; + int ret; + + trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); + + /* Give back the reference that we took in xfile_get_page. */ + put_page(xfpage->page); + + pflags = memalloc_nofs_save(); + ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, + xfpage->page, xfpage->fsdata); + memalloc_nofs_restore(pflags); + memset(xfpage, 0, sizeof(struct xfile_page)); + + if (ret < 0) + return ret; + if (ret != PAGE_SIZE) + return -EIO; + return 0; +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h new file mode 100644 index 000000000000..d56643b0f429 --- /dev/null +++ b/fs/xfs/scrub/xfile.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFILE_H__ +#define __XFS_SCRUB_XFILE_H__ + +struct xfile_page { + struct page *page; + void *fsdata; + loff_t pos; +}; + +static inline bool xfile_page_cached(const struct xfile_page *xfpage) +{ + return xfpage->page != NULL; +} + +static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) +{ + return xfpage->page->index; +} + +struct xfile { + struct file *file; +}; + +int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); +void xfile_destroy(struct xfile *xf); + +ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); +ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, + loff_t pos); + +/* + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pread(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +/* + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pwrite(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +loff_t xfile_seek_data(struct xfile *xf, loff_t pos); + +struct xfile_stat { + loff_t size; + unsigned long long bytes; +}; + +int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); + +int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, + struct xfile_page *xbuf); +int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); + +#endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 791db7d9c849..6b840301817a 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -233,7 +233,7 @@ xfs_acl_set_mode( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xfs_has_wsync(mp)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 451942fb38ec..465d7630bb21 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -478,7 +478,7 @@ xfs_discard_folio( folio, ip->i_ino, pos); /* - * The end of the punch range is always the offset of the the first + * The end of the punch range is always the offset of the first * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = { .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, .writepages = xfs_vm_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5db87b34fb6e..89c7a9f4f930 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -333,7 +333,6 @@ xfs_attr_inactive( int error = 0; mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); if (!xfs_inode_has_attr_fork(dp)) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2788a6f2edcd..36fe2abb16e6 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -547,7 +547,7 @@ xfs_attri_item_recover( struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; - struct xfs_trans_res tres; + struct xfs_trans_res resv; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; @@ -618,8 +618,9 @@ xfs_attri_item_recover( goto out; } - xfs_init_attr_trans(args, &tres, &total); - error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + xfs_init_attr_trans(args, &resv, &total); + resv = xlog_recover_resv(&resv); + error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 7551c3ec4ea5..e736a0844c89 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -490,6 +490,7 @@ xfs_bui_item_recover( struct list_head *capture_list) { struct xfs_bmap_intent fake = { }; + struct xfs_trans_res resv; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; @@ -515,7 +516,8 @@ xfs_bui_item_recover( return error; /* Allocate transaction and do the work. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) goto err_rele; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fbb675563208..fcefab687285 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1644,6 +1644,7 @@ xfs_swap_extents( uint64_t f; int resblks = 0; unsigned int flags = 0; + struct timespec64 ctime; /* * Lock the inodes against other IO, page faults and truncate to @@ -1756,8 +1757,9 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + ctime = inode_get_ctime(VFS_I(ip)); + if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = -EBUSY; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15d1e5a7c2d3..c1ece4a08ff4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -481,7 +481,8 @@ _xfs_buf_obj_cmp( * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. */ - ASSERT(bp->b_flags & XBF_STALE); + if (!(map->bm_flags & XBM_LIVESCAN)) + ASSERT(bp->b_flags & XBF_STALE); return 1; } return 0; @@ -559,6 +560,10 @@ xfs_buf_find_lock( * intact here. */ if (bp->b_flags & XBF_STALE) { + if (flags & XBF_LIVESCAN) { + xfs_buf_unlock(bp); + return -ENOENT; + } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; @@ -682,6 +687,8 @@ xfs_buf_get_map( int error; int i; + if (flags & XBF_LIVESCAN) + cmap.bm_flags |= XBM_LIVESCAN; for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; @@ -1938,14 +1945,17 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { + struct block_device *bdev = btp->bt_bdev; + unregister_shrinker(&btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - blkdev_issue_flush(btp->bt_bdev); - invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); + /* the main block device is closed by kill_block_super */ + if (bdev != btp->bt_mount->m_super->s_bdev) + blkdev_put(bdev, btp->bt_mount->m_super); kmem_free(btp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 549c60942208..df8f47953bb4 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -44,6 +44,11 @@ struct xfs_buf; #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ +/* + * Online fsck is scanning the buffer cache for live buffers. Do not warn + * about length mismatches during lookups and do not return stale buffers. + */ +#define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ @@ -67,6 +72,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ + { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } @@ -114,8 +120,15 @@ typedef struct xfs_buftarg { struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ + unsigned int bm_flags; }; +/* + * Online fsck is scanning the buffer cache for live buffers. Do not warn + * about length mismatches during lookups and do not return stale buffers. + */ +#define XBM_LIVESCAN (1U << 0) + #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7f071757f278..ac6ba646624d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1386,7 +1386,7 @@ xfs_qm_dqiterate( return error; error = iter_fn(dq, type, priv); - id = dq->q_id; + id = dq->q_id + 1; xfs_qm_dqput(dq); } while (error == 0 && id != 0); diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1064c2342876..f71ea786a6d2 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -146,6 +146,12 @@ xfs_nfs_get_inode( return ERR_PTR(error); } + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_irele(ip); + return ERR_PTR(error); + } + if (VFS_I(ip)->i_generation != generation) { xfs_irele(ip); return ERR_PTR(-ESTALE); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f1a5ecf099aa..3fa8789820ad 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -660,6 +660,7 @@ xfs_efi_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; @@ -683,7 +684,8 @@ xfs_efi_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4f502219ae4f..203700278ddb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1287,11 +1287,11 @@ xfs_file_llseek( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { - return dax_iomap_fault(vmf, pe_size, pfn, NULL, + return dax_iomap_fault(vmf, order, pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_dax_write_iomap_ops : &xfs_read_iomap_ops); @@ -1300,7 +1300,7 @@ xfs_dax_fault( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { @@ -1322,14 +1322,14 @@ xfs_dax_fault( static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; - trace_xfs_filemap_fault(ip, pe_size, write_fault); + trace_xfs_filemap_fault(ip, order, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); @@ -1340,9 +1340,9 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); + ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { if (write_fault) { @@ -1373,7 +1373,7 @@ xfs_filemap_fault( struct vm_fault *vmf) { /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + return __xfs_filemap_fault(vmf, 0, IS_DAX(file_inode(vmf->vma->vm_file)) && xfs_is_write_fault(vmf)); } @@ -1381,13 +1381,13 @@ xfs_filemap_fault( static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, pe_size, + return __xfs_filemap_fault(vmf, order, xfs_is_write_fault(vmf)); } @@ -1395,7 +1395,7 @@ static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } /* @@ -1408,7 +1408,7 @@ xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10403ba9b58f..736e5545f584 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -565,6 +565,19 @@ err: } #endif /* CONFIG_XFS_RT */ +static inline bool +rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r) +{ + if (!xfs_has_reflink(mp)) + return true; + if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner)) + return true; + if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return true; + return false; +} + /* Execute a getfsmap query against the regular data device. */ STATIC int __xfs_getfsmap_datadev( @@ -598,7 +611,6 @@ __xfs_getfsmap_datadev( * low to the fsmap low key and max out the high key to the end * of the AG. */ - info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) @@ -608,12 +620,9 @@ __xfs_getfsmap_datadev( /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { - /* empty */ - } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || - (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | - XFS_RMAP_BMBT_BLOCK | - XFS_RMAP_UNWRITTEN))) { - info->low.rm_startblock += info->low.rm_blockcount; + /* No previous record from which to continue */ + } else if (rmap_not_shareable(mp, &info->low)) { + /* Last record seen was an unshareable extent */ info->low.rm_owner = 0; info->low.rm_offset = 0; @@ -621,8 +630,10 @@ __xfs_getfsmap_datadev( if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) return 0; } else { + /* Last record seen was a shareable file data extent */ info->low.rm_offset += info->low.rm_blockcount; } + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 453890942d9f..3c210ac83713 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -113,7 +113,7 @@ xfs_inode_alloc( INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return ip; } @@ -443,7 +443,7 @@ xfs_inodegc_queue_all( int cpu; bool ret = false; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); @@ -463,7 +463,7 @@ xfs_inodegc_wait_all( int error = 0; flush_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { struct xfs_inodegc *gc; gc = per_cpu_ptr(mp->m_inodegc, cpu); @@ -803,44 +803,6 @@ out_error_or_again: } /* - * "Is this a cached inode that's also allocated?" - * - * Look up an inode by number in the given file system. If the inode is - * in cache and isn't in purgatory, return 1 if the inode is allocated - * and 0 if it is not. For all other cases (not in cache, being torn - * down, etc.), return a negative error code. - * - * The caller has to prevent inode allocation and freeing activity, - * presumably by locking the AGI buffer. This is to ensure that an - * inode cannot transition from allocated to freed until the caller is - * ready to allow that. If the inode is in an intermediate state (new, - * reclaimable, or being reclaimed), -EAGAIN will be returned; if the - * inode is not in the cache, -ENOENT will be returned. The caller must - * deal with these scenarios appropriately. - * - * This is a specialized use case for the online scrubber; if you're - * reading this, you probably want xfs_iget. - */ -int -xfs_icache_inode_is_allocated( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_ino_t ino, - bool *inuse) -{ - struct xfs_inode *ip; - int error; - - error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); - if (error) - return error; - - *inuse = !!(VFS_I(ip)->i_mode); - xfs_irele(ip); - return 0; -} - -/* * Grab the inode for reclaim exclusively. * * We have found this inode via a lookup under RCU, so the inode may have @@ -1883,9 +1845,17 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + struct xfs_mount *mp = gc->mp; unsigned int nofs_flag; - ASSERT(gc->cpu == smp_processor_id()); + /* + * Clear the cpu mask bit and ensure that we have seen the latest + * update of the gc structure associated with this CPU. This matches + * with the release semantics used when setting the cpumask bit in + * xfs_inodegc_queue. + */ + cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); + smp_mb__after_atomic(); WRITE_ONCE(gc->items, 0); @@ -1900,7 +1870,7 @@ xfs_inodegc_worker( nofs_flag = memalloc_nofs_save(); ip = llist_entry(node, struct xfs_inode, i_gclist); - trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { @@ -2095,6 +2065,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned int cpu_nr; unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); @@ -2102,18 +2073,28 @@ xfs_inodegc_queue( ip->i_flags |= XFS_NEED_INACTIVE; spin_unlock(&ip->i_flags_lock); - gc = get_cpu_ptr(mp->m_inodegc); + cpu_nr = get_cpu(); + gc = this_cpu_ptr(mp->m_inodegc); llist_add(&ip->i_gclist, &gc->list); items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); /* + * Ensure the list add is always seen by anyone who finds the cpumask + * bit set. This effectively gives the cpumask bit set operation + * release ordering semantics. + */ + smp_mb__before_atomic(); + if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask)) + cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask); + + /* * We queue the work while holding the current CPU so that the work * is scheduled to run on this CPU. */ if (!xfs_is_inodegc_enabled(mp)) { - put_cpu_ptr(gc); + put_cpu(); return; } @@ -2123,7 +2104,7 @@ xfs_inodegc_queue( trace_xfs_inodegc_queue(mp, __return_address); mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, queue_delay); - put_cpu_ptr(gc); + put_cpu(); if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); @@ -2132,47 +2113,6 @@ xfs_inodegc_queue( } /* - * Fold the dead CPU inodegc queue into the current CPUs queue. - */ -void -xfs_inodegc_cpu_dead( - struct xfs_mount *mp, - unsigned int dead_cpu) -{ - struct xfs_inodegc *dead_gc, *gc; - struct llist_node *first, *last; - unsigned int count = 0; - - dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_delayed_work_sync(&dead_gc->work); - - if (llist_empty(&dead_gc->list)) - return; - - first = dead_gc->list.first; - last = first; - while (last->next) { - last = last->next; - count++; - } - dead_gc->list.first = NULL; - dead_gc->items = 0; - - /* Add pending work to current CPU */ - gc = get_cpu_ptr(mp->m_inodegc); - llist_add_batch(first, last, &gc->list); - count += READ_ONCE(gc->items); - WRITE_ONCE(gc->items, count); - - if (xfs_is_inodegc_enabled(mp)) { - trace_xfs_inodegc_queue(mp, __return_address); - mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, - 0); - } - put_cpu_ptr(gc); -} - -/* * We set the inode flag atomically with the radix tree tag. Once we get tag * lookups on the radix tree, this inode flag can go away. * @@ -2233,7 +2173,7 @@ xfs_inodegc_shrinker_count( if (!xfs_is_inodegc_enabled(mp)) return 0; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) return XFS_INODEGC_SHRINKER_COUNT; @@ -2258,7 +2198,7 @@ xfs_inodegc_shrinker_scan( trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { unsigned int h = READ_ONCE(gc->shrinker_hits); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 1dcdcb23796e..905944dafbe5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -71,10 +71,6 @@ void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); - -int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_ino_t ino, bool *inuse); - void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); @@ -83,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp); int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); -void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); int xfs_inodegc_register_shrinker(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9e62cc500140..f94f7b374041 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -843,10 +843,9 @@ xfs_init_new_inode( ip->i_df.if_nextents = 0; ASSERT(ip->i_nblocks == 0); - tv = current_time(inode); + tv = inode_set_ctime_current(inode); inode->i_mtime = tv; inode->i_atime = tv; - inode->i_ctime = tv; ip->i_extsize = 0; ip->i_diflags = 0; @@ -1643,8 +1642,11 @@ xfs_inode_needs_inactive( if (VFS_I(ip)->i_mode == 0) return false; - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) return false; /* If the log isn't running, push inodes straight to reclaim. */ @@ -1704,8 +1706,11 @@ xfs_inactive( mp = ip->i_mount; ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) goto out; /* Metadata inodes require explicit resource cleanup. */ @@ -1737,9 +1742,13 @@ xfs_inactive( ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip); - if (error) - goto out; + if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { + xfs_qm_dqdetach(ip); + } else { + error = xfs_qm_dqattach(ip); + if (error) + goto out; + } if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); @@ -1823,12 +1832,17 @@ xfs_iunlink_lookup( rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* Caller can handle inode not being in memory. */ + rcu_read_unlock(); + return NULL; + } /* - * Inode not in memory or in RCU freeing limbo should not happen. - * Warn about this and let the caller handle the failure. + * Inode in RCU freeing limbo should not happen. Warn about this and + * let the caller handle the failure. */ - if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + if (WARN_ON_ONCE(!ip->i_ino)) { rcu_read_unlock(); return NULL; } @@ -1837,7 +1851,10 @@ xfs_iunlink_lookup( return ip; } -/* Update the prev pointer of the next agino. */ +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ static int xfs_iunlink_update_backref( struct xfs_perag *pag, @@ -1852,7 +1869,8 @@ xfs_iunlink_update_backref( ip = xfs_iunlink_lookup(pag, next_agino); if (!ip) - return -EFSCORRUPTED; + return -ENOLINK; + ip->i_prev_unlinked = prev_agino; return 0; } @@ -1896,6 +1914,64 @@ xfs_iunlink_update_bucket( return 0; } +/* + * Load the inode @next_agino into the cache and set its prev_unlinked pointer + * to @prev_agino. Caller must hold the AGI to synchronize with other changes + * to the unlinked list. + */ +STATIC int +xfs_iunlink_reload_next( + struct xfs_trans *tp, + struct xfs_buf *agibp, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_perag *pag = agibp->b_pag; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *next_ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(next_agino != NULLAGINO); + +#ifdef DEBUG + rcu_read_lock(); + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); + ASSERT(next_ip == NULL); + rcu_read_unlock(); +#endif + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", + next_agino, pag->pag_agno); + + /* + * Use an untrusted lookup just to be cautious in case the AGI has been + * corrupted and now points at a free inode. That shouldn't happen, + * but we'd rather shut down now since we're already running in a weird + * situation. + */ + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); + if (error) + return error; + + /* If this is not an unlinked inode, something is very wrong. */ + if (VFS_I(next_ip)->i_nlink != 0) { + error = -EFSCORRUPTED; + goto rele; + } + + next_ip->i_prev_unlinked = prev_agino; + trace_xfs_iunlink_reload_next(next_ip); +rele: + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + if (xfs_is_quotacheck_running(mp) && next_ip) + xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); + xfs_irele(next_ip); + return error; +} + static int xfs_iunlink_insert_inode( struct xfs_trans *tp, @@ -1927,6 +2003,8 @@ xfs_iunlink_insert_inode( * inode. */ error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); if (error) return error; @@ -1942,6 +2020,7 @@ xfs_iunlink_insert_inode( } /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } @@ -2021,6 +2100,9 @@ xfs_iunlink_remove_inode( */ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); if (error) return error; @@ -2041,7 +2123,7 @@ xfs_iunlink_remove_inode( } ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return error; } @@ -3530,3 +3612,103 @@ xfs_iunlock2_io_mmap( if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } + +/* + * Reload the incore inode list for this inode. Caller should ensure that + * the link count cannot change, either by taking ILOCK_SHARED or otherwise + * preventing other threads from executing. + */ +int +xfs_inode_reload_unlinked_bucket( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agibp; + struct xfs_agi *agi; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t prev_agino, next_agino; + unsigned int bucket; + bool foundit = false; + int error; + + /* Grab the first inode in the list */ + pag = xfs_perag_get(mp, agno); + error = xfs_ialloc_read_agi(pag, tp, &agibp); + xfs_perag_put(pag); + if (error) + return error; + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + agi = agibp->b_addr; + + trace_xfs_inode_reload_unlinked_bucket(ip); + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.", + agino, agno); + + prev_agino = NULLAGINO; + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + struct xfs_inode *next_ip = NULL; + + if (next_agino == agino) { + /* Found this inode, set its backlink. */ + next_ip = ip; + next_ip->i_prev_unlinked = prev_agino; + foundit = true; + } + if (!next_ip) { + /* Inode already in memory. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + } + if (!next_ip) { + /* Inode not in memory, reload. */ + error = xfs_iunlink_reload_next(tp, agibp, prev_agino, + next_agino); + if (error) + break; + + next_ip = xfs_iunlink_lookup(pag, next_agino); + } + if (!next_ip) { + /* No incore inode at all? We reloaded it... */ + ASSERT(next_ip != NULL); + error = -EFSCORRUPTED; + break; + } + + prev_agino = next_agino; + next_agino = next_ip->i_next_unlinked; + } + + xfs_trans_brelse(tp, agibp); + /* Should have found this inode somewhere in the iunlinked bucket. */ + if (!error && !foundit) + error = -EFSCORRUPTED; + return error; +} + +/* Decide if this inode is missing its unlinked list and reload it. */ +int +xfs_inode_reload_unlinked( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_inode_unlinked_incomplete(ip)) + error = xfs_inode_reload_unlinked_bucket(tp, ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_trans_cancel(tp); + + return error; +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7547caf2f2ab..0c5bdb91152e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -68,8 +68,21 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ - /* unlinked list pointers */ + /* + * Unlinked list pointers. These point to the next and previous inodes + * in the AGI unlinked bucket list, respectively. These fields can + * only be updated with the AGI locked. + * + * i_next_unlinked caches di_next_unlinked. + */ xfs_agino_t i_next_unlinked; + + /* + * If the inode is not on an unlinked list, this field is zero. If the + * inode is the first element in an unlinked list, this field is + * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode + * in the unlinked list. + */ xfs_agino_t i_prev_unlinked; /* VFS inode */ @@ -81,6 +94,11 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) +{ + return ip->i_prev_unlinked != 0; +} + static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) { return ip->i_forkoff > 0; @@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) */ #define XFS_INACTIVATING (1 << 13) +/* Quotacheck is running but inode has not been added to quota counts. */ +#define XFS_IQUOTAUNCHECKED (1 << 14) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ - XFS_INACTIVATING) + XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* * Flags for inode locking. @@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +static inline bool +xfs_inode_unlinked_incomplete( + struct xfs_inode *ip) +{ + return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); +} +int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_inode_reload_unlinked(struct xfs_inode *ip); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 91c847a84e10..127b2410eb20 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -528,7 +528,7 @@ xfs_inode_to_log_dinode( memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 24718adb3c16..1c1e6171209d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -574,7 +574,7 @@ xfs_vn_getattr( stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + stat->ctime = inode_get_ctime(inode); stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -1029,7 +1029,6 @@ xfs_vn_setattr( STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec64 *now, int flags) { struct xfs_inode *ip = XFS_I(inode); @@ -1037,13 +1036,16 @@ xfs_vn_update_time( int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; + struct timespec64 now; trace_xfs_update_time(ip); if (inode->i_sb->s_flags & SB_LAZYTIME) { if (!((flags & S_VERSION) && - inode_maybe_inc_iversion(inode, false))) - return generic_update_time(inode, now, flags); + inode_maybe_inc_iversion(inode, false))) { + generic_update_time(inode, flags); + return 0; + } /* Capture the iversion update that just occurred */ log_flags |= XFS_ILOG_CORE; @@ -1054,12 +1056,15 @@ xfs_vn_update_time( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) - inode->i_ctime = *now; + if (flags & (S_CTIME|S_MTIME)) + now = inode_set_ctime_current(inode); + else + now = current_time(inode); + if (flags & S_MTIME) - inode->i_mtime = *now; + inode->i_mtime = now; if (flags & S_ATIME) - inode->i_atime = *now; + inode->i_atime = now; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, log_flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f225413a993c..ccf0c4ff4490 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -80,6 +80,15 @@ xfs_bulkstat_one_int( if (error) goto out; + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked_bucket(tp, ip); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_irele(ip); + return error; + } + } + ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); @@ -100,8 +109,8 @@ xfs_bulkstat_one_int( buf->bs_atime_nsec = inode->i_atime.tv_nsec; buf->bs_mtime = inode->i_mtime.tv_sec; buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime = inode->i_ctime.tv_sec; - buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; + buf->bs_ctime = inode_get_ctime(inode).tv_sec; + buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 74dcb05069e8..e9d317a3dafe 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -63,6 +63,7 @@ typedef __u32 xfs_nlink_t; #include <linux/rhashtable.h> #include <linux/xattr.h> #include <linux/mnt_idmapping.h> +#include <linux/debugfs.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 79004d193e54..51c100c86177 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -715,15 +715,7 @@ xfs_log_mount( * just worked. */ if (!xfs_has_norecovery(mp)) { - /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, - &mp->m_opstate); error = xlog_recover(log); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); @@ -772,7 +764,6 @@ xfs_log_mount_finish( struct xfs_mount *mp) { struct xlog *log = mp->m_log; - bool readonly; int error = 0; if (xfs_has_norecovery(mp)) { @@ -781,12 +772,6 @@ xfs_log_mount_finish( } /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - - /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -835,8 +820,6 @@ xfs_log_mount_finish( xfs_buftarg_drain(mp->m_ddev_targp); clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ ASSERT(!error || xlog_is_shutdown(log)); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index eccbfb99e894..ebc70aaa299c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -124,7 +124,7 @@ xlog_cil_push_pcp_aggregate( struct xlog_cil_pcp *cilpcp; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &ctx->cil_pcpmask) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); ctx->ticket->t_curr_res += cilpcp->space_reserved; @@ -165,7 +165,13 @@ xlog_cil_insert_pcp_aggregate( if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) return; - for_each_online_cpu(cpu) { + /* + * We can race with other cpus setting cil_pcpmask. However, we've + * atomically cleared PCP_SPACE which forces other threads to add to + * the global space used count. cil_pcpmask is a superset of cilpcp + * structures that could have a nonzero space_used. + */ + for_each_cpu(cpu, &ctx->cil_pcpmask) { int old, prev; cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); @@ -554,6 +560,7 @@ xlog_cil_insert_items( int iovhdr_res = 0, split_res = 0, ctx_res = 0; int space_used; int order; + unsigned int cpu_nr; struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -577,7 +584,12 @@ xlog_cil_insert_items( * can't be scheduled away between split sample/update operations that * are done without outside locking to serialise them. */ - cilpcp = get_cpu_ptr(cil->xc_pcp); + cpu_nr = get_cpu(); + cilpcp = this_cpu_ptr(cil->xc_pcp); + + /* Tell the future push that there was work added by this CPU. */ + if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask)) + cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask); /* * We need to take the CIL checkpoint unit reservation on the first @@ -663,7 +675,7 @@ xlog_cil_insert_items( continue; list_add_tail(&lip->li_cil, &cilpcp->log_items); } - put_cpu_ptr(cilpcp); + put_cpu(); /* * If we've overrun the reservation, dump the tx details before we move @@ -1791,38 +1803,6 @@ out_shutdown: } /* - * Move dead percpu state to the relevant CIL context structures. - * - * We have to lock the CIL context here to ensure that nothing is modifying - * the percpu state, either addition or removal. Both of these are done under - * the CIL context lock, so grabbing that exclusively here will ensure we can - * safely drain the cilpcp for the CPU that is dying. - */ -void -xlog_cil_pcp_dead( - struct xlog *log, - unsigned int cpu) -{ - struct xfs_cil *cil = log->l_cilp; - struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - struct xfs_cil_ctx *ctx; - - down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx; - if (ctx->ticket) - ctx->ticket->t_curr_res += cilpcp->space_reserved; - cilpcp->space_reserved = 0; - - if (!list_empty(&cilpcp->log_items)) - list_splice_init(&cilpcp->log_items, &ctx->log_items); - if (!list_empty(&cilpcp->busy_extents)) - list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); - atomic_add(cilpcp->space_used, &ctx->space_used); - cilpcp->space_used = 0; - up_write(&cil->xc_ctx_lock); -} - -/* * Perform initial CIL structure initialisation. */ int diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1bd2963e8fbd..af87648331d5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -231,6 +231,12 @@ struct xfs_cil_ctx { struct work_struct discard_endio_work; struct work_struct push_work; atomic_t order_id; + + /* + * CPUs that could have added items to the percpu CIL data. Access is + * coordinated with xc_ctx_lock. + */ + struct cpumask cil_pcpmask; }; /* @@ -278,9 +284,6 @@ struct xfs_cil { wait_queue_head_t xc_push_wait; /* background push throttle */ void __percpu *xc_pcp; /* percpu CIL structures */ -#ifdef CONFIG_HOTPLUG_CPU - struct list_head xc_pcp_list; -#endif } ____cacheline_aligned_in_smp; /* xc_flags bit values */ @@ -705,9 +708,4 @@ xlog_kvmalloc( return p; } -/* - * CIL CPU dead notifier - */ -void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); - #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82c81d20459d..13b94d2e605b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -329,7 +329,7 @@ xlog_find_verify_cycle( * try a smaller size. We need to be able to read at least * a log sector, or we're out of luck. */ - bufblks = 1 << ffs(nbblks); + bufblks = roundup_pow_of_two(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -1528,7 +1528,7 @@ xlog_write_log_records( * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ - bufblks = 1 << ffs(blocks); + bufblks = roundup_pow_of_two(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fb87ffb48f7f..0a0fd19573d8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -34,6 +34,7 @@ #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" +#include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -716,9 +717,11 @@ xfs_mountfs( if (error) goto out_remove_sysfs; + xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); + error = xfs_error_sysfs_init(mp); if (error) - goto out_del_stats; + goto out_remove_scrub_stats; error = xfs_errortag_init(mp); if (error) @@ -1033,7 +1036,8 @@ xfs_mountfs( xfs_errortag_del(mp); out_remove_error_sysfs: xfs_error_sysfs_del(mp); - out_del_stats: + out_remove_scrub_stats: + xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: xfs_sysfs_del(&mp->m_kobj); @@ -1105,6 +1109,7 @@ xfs_unmountfs( xfs_errortag_del(mp); xfs_error_sysfs_del(mp); + xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e2866e7fa60c..d19cca099bc3 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,6 +60,7 @@ struct xfs_error_cfg { * Per-cpu deferred inode inactivation GC lists. */ struct xfs_inodegc { + struct xfs_mount *mp; struct llist_head list; struct delayed_work work; int error; @@ -67,9 +68,7 @@ struct xfs_inodegc { /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; -#if defined(DEBUG) || defined(XFS_WARN) unsigned int cpu; -#endif }; /* @@ -98,7 +97,6 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - struct list_head m_mount_list; /* global mount list */ void __percpu *m_inodegc; /* percpu inodegc structures */ /* @@ -208,11 +206,15 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS + struct xchk_stats *m_scrub_stats; +#endif xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ @@ -245,6 +247,9 @@ typedef struct xfs_mount { unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif + + /* cpus that have inodes queued for inactivation */ + struct cpumask m_inodegc_cpumask; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) @@ -400,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_SHRINK 8 /* Kernel has logged a warning about logged xattr updates being used. */ #define XFS_OPSTATE_WARNED_LARP 9 +/* Mount time quotacheck is running */ +#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -422,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32) __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +#ifdef CONFIG_XFS_QUOTA +__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) +#else +# define xfs_is_quotacheck_running(mp) (false) +#endif static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -439,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ - { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } /* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6abcc34fafd8..7256090c3895 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1160,6 +1160,10 @@ xfs_qm_dqusage_adjust( if (error) return error; + error = xfs_inode_reload_unlinked(ip); + if (error) + goto error0; + ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { @@ -1173,6 +1177,7 @@ xfs_qm_dqusage_adjust( } nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; + xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); /* * Add the (disk blocks and inode) resources occupied by this @@ -1319,8 +1324,10 @@ xfs_qm_quotacheck( flags |= XFS_PQUOTA_CHKD; } + xfs_set_quotacheck_running(mp); error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); + xfs_clear_quotacheck_running(mp); /* * On error, the inode walk may have partially populated the dquot diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index edd8587658d5..2d4444d61e98 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -477,6 +477,7 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_cud_log_item *cudp; struct xfs_trans *tp; @@ -514,8 +515,9 @@ xfs_cui_item_recover( * doesn't fit. We need to reserve enough blocks to handle a * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 520c7ebdfed8..0e0e747028da 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -507,6 +507,7 @@ xfs_rui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_rud_log_item *rudp; struct xfs_trans *tp; @@ -530,8 +531,9 @@ xfs_rui_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 818510243130..819a3568b28f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -42,6 +42,7 @@ #include "xfs_xattr.h" #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" +#include "scrub/stats.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -49,33 +50,12 @@ static const struct super_operations xfs_super_operations; +static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -#ifdef CONFIG_HOTPLUG_CPU -static LIST_HEAD(xfs_mount_list); -static DEFINE_SPINLOCK(xfs_mount_list_lock); - -static inline void xfs_mount_list_add(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_add(&mp->m_mount_list, &xfs_mount_list); - spin_unlock(&xfs_mount_list_lock); -} - -static inline void xfs_mount_list_del(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_del(&mp->m_mount_list); - spin_unlock(&xfs_mount_list_lock); -} -#else /* !CONFIG_HOTPLUG_CPU */ -static inline void xfs_mount_list_add(struct xfs_mount *mp) {} -static inline void xfs_mount_list_del(struct xfs_mount *mp) {} -#endif - enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -377,17 +357,6 @@ disable_dax: return 0; } -static void -xfs_bdev_mark_dead( - struct block_device *bdev) -{ - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); -} - -static const struct blk_holder_ops xfs_holder_ops = { - .mark_dead = xfs_bdev_mark_dead, -}; - STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -396,8 +365,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, - &xfs_holder_ops); + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); @@ -407,31 +376,45 @@ xfs_blkdev_get( } STATIC void -xfs_blkdev_put( - struct xfs_mount *mp, - struct block_device *bdev) -{ - if (bdev) - blkdev_put(bdev, mp); -} - -STATIC void -xfs_close_devices( +xfs_shutdown_devices( struct xfs_mount *mp) { + /* + * Udev is triggered whenever anyone closes a block device or unmounts + * a file systemm on a block device. + * The default udev rules invoke blkid to read the fs super and create + * symlinks to the bdev under /dev/disk. For this, it uses buffered + * reads through the page cache. + * + * xfs_db also uses buffered reads to examine metadata. There is no + * coordination between xfs_db and udev, which means that they can run + * concurrently. Note there is no coordination between the kernel and + * blkid either. + * + * On a system with 64k pages, the page cache can cache the superblock + * and the root inode (and hence the root directory) with the same 64k + * page. If udev spawns blkid after the mkfs and the system is busy + * enough that it is still running when xfs_db starts up, they'll both + * read from the same page in the pagecache. + * + * The unmount writes updated inode metadata to disk directly. The XFS + * buffer cache does not use the bdev pagecache, so it needs to + * invalidate that pagecache on unmount. If the above scenario occurs, + * the pagecache no longer reflects what's on disk, xfs_db reads the + * stale metadata, and fails to find /a. Most of the time this succeeds + * because closing a bdev invalidates the page cache, but when processes + * race, everyone loses. + */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { - struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(mp, logdev); + blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); + invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { - struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(mp, rtdev); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } - xfs_free_buftarg(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* @@ -448,17 +431,24 @@ STATIC int xfs_open_devices( struct xfs_mount *mp) { - struct block_device *ddev = mp->m_super->s_bdev; + struct super_block *sb = mp->m_super; + struct block_device *ddev = sb->s_bdev; struct block_device *logdev = NULL, *rtdev = NULL; int error; /* + * blkdev_put() can't be called under s_umount, see the comment + * in get_tree_bdev() for more details + */ + up_write(&sb->s_umount); + + /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) - return error; + goto out_relock; } if (mp->m_rtname) { @@ -496,7 +486,10 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; } - return 0; + error = 0; +out_relock: + down_write(&sb->s_umount); + return error; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -504,11 +497,12 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(mp, rtdev); + if (rtdev) + blkdev_put(rtdev, sb); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(mp, logdev); - return error; + blkdev_put(logdev, sb); + goto out_relock; } /* @@ -758,6 +752,18 @@ static void xfs_mount_free( struct xfs_mount *mp) { + /* + * Free the buftargs here because blkdev_put needs to be called outside + * of sb->s_umount, which is held around the call to ->put_super. + */ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + if (mp->m_ddev_targp) + xfs_free_buftarg(mp->m_ddev_targp); + + debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); kmem_free(mp); @@ -1107,9 +1113,8 @@ xfs_inodegc_init_percpu( for_each_possible_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); -#if defined(DEBUG) || defined(XFS_WARN) gc->cpu = cpu; -#endif + gc->mp = mp; init_llist_head(&gc->list); gc->items = 0; gc->error = 0; @@ -1133,24 +1138,17 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; - xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); xfs_freesb(mp); + xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - - sb->s_fs_info = NULL; - xfs_mount_free(mp); + xfs_shutdown_devices(mp); } static long @@ -1479,6 +1477,21 @@ xfs_fs_validate_params( return 0; } +struct dentry * +xfs_debugfs_mkdir( + const char *name, + struct dentry *parent) +{ + struct dentry *child; + + /* Apparently we're expected to ignore error returns?? */ + child = debugfs_create_dir(name, parent); + if (IS_ERR(child)) + return NULL; + + return child; +} + static int xfs_fs_fill_super( struct super_block *sb, @@ -1492,7 +1505,7 @@ xfs_fs_fill_super( error = xfs_fs_validate_params(mp); if (error) - goto out_free_names; + return error; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1519,11 +1532,18 @@ xfs_fs_fill_super( error = xfs_open_devices(mp); if (error) - goto out_free_names; + return error; + + if (xfs_debugfs) { + mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id, + xfs_debugfs); + } else { + mp->m_debugfs = NULL; + } error = xfs_init_mount_workqueues(mp); if (error) - goto out_close_devices; + goto out_shutdown_devices; error = xfs_init_percpu_counters(mp); if (error) @@ -1533,13 +1553,6 @@ xfs_fs_fill_super( if (error) goto out_destroy_counters; - /* - * All percpu data structures requiring cleanup when a cpu goes offline - * must be allocated before adding this @mp to the cpu-dead handler's - * mount list. - */ - xfs_mount_list_add(mp); - /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { @@ -1547,10 +1560,14 @@ xfs_fs_fill_super( goto out_destroy_inodegc; } - error = xfs_readsb(mp, flags); + error = xchk_mount_stats_alloc(mp); if (error) goto out_free_stats; + error = xfs_readsb(mp, flags); + if (error) + goto out_free_scrub_stats; + error = xfs_finish_flags(mp); if (error) goto out_free_sb; @@ -1728,20 +1745,18 @@ xfs_fs_fill_super( xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); + out_free_scrub_stats: + xchk_mount_stats_free(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_inodegc: - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); - out_close_devices: - xfs_close_devices(mp); - out_free_names: - sb->s_fs_info = NULL; - xfs_mount_free(mp); + out_shutdown_devices: + xfs_shutdown_devices(mp); return error; out_unmount: @@ -1934,7 +1949,8 @@ xfs_fs_reconfigure( return 0; } -static void xfs_fs_free( +static void +xfs_fs_free( struct fs_context *fc) { struct xfs_mount *mp = fc->s_fs_info; @@ -2003,12 +2019,20 @@ static int xfs_init_fs_context( return 0; } +static void +xfs_kill_sb( + struct super_block *sb) +{ + kill_block_super(sb); + xfs_mount_free(XFS_M(sb)); +} + static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, - .kill_sb = kill_block_super, + .kill_sb = xfs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); @@ -2270,49 +2294,6 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); } -#ifdef CONFIG_HOTPLUG_CPU -static int -xfs_cpu_dead( - unsigned int cpu) -{ - struct xfs_mount *mp, *n; - - spin_lock(&xfs_mount_list_lock); - list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { - spin_unlock(&xfs_mount_list_lock); - xfs_inodegc_cpu_dead(mp, cpu); - xlog_cil_pcp_dead(mp->m_log, cpu); - spin_lock(&xfs_mount_list_lock); - } - spin_unlock(&xfs_mount_list_lock); - return 0; -} - -static int __init -xfs_cpu_hotplug_init(void) -{ - int error; - - error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, - xfs_cpu_dead); - if (error < 0) - xfs_alert(NULL, -"Failed to initialise CPU hotplug, error %d. XFS is non-functional.", - error); - return error; -} - -static void -xfs_cpu_hotplug_destroy(void) -{ - cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); -} - -#else /* !CONFIG_HOTPLUG_CPU */ -static inline int xfs_cpu_hotplug_init(void) { return 0; } -static inline void xfs_cpu_hotplug_destroy(void) {} -#endif - STATIC int __init init_xfs_fs(void) { @@ -2329,13 +2310,9 @@ init_xfs_fs(void) xfs_dir_startup(); - error = xfs_cpu_hotplug_init(); - if (error) - goto out; - error = xfs_init_caches(); if (error) - goto out_destroy_hp; + goto out; error = xfs_init_workqueues(); if (error) @@ -2353,10 +2330,12 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; + xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL); + xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); if (!xfs_kset) { error = -ENOMEM; - goto out_sysctl_unregister; + goto out_debugfs_unregister; } xfsstats.xs_kobj.kobject.kset = xfs_kset; @@ -2372,11 +2351,15 @@ init_xfs_fs(void) if (error) goto out_free_stats; + error = xchk_global_stats_setup(xfs_debugfs); + if (error) + goto out_remove_stats_kobj; + #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) - goto out_remove_stats_kobj; + goto out_remove_scrub_stats; #endif error = xfs_qm_init(); @@ -2393,14 +2376,17 @@ init_xfs_fs(void) out_remove_dbg_kobj: #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); - out_remove_stats_kobj: + out_remove_scrub_stats: #endif + xchk_global_stats_teardown(); + out_remove_stats_kobj: xfs_sysfs_del(&xfsstats.xs_kobj); out_free_stats: free_percpu(xfsstats.xs_stats); out_kset_unregister: kset_unregister(xfs_kset); - out_sysctl_unregister: + out_debugfs_unregister: + debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); @@ -2410,8 +2396,6 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_caches: xfs_destroy_caches(); - out_destroy_hp: - xfs_cpu_hotplug_destroy(); out: return error; } @@ -2424,16 +2408,17 @@ exit_xfs_fs(void) #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif + xchk_global_stats_teardown(); xfs_sysfs_del(&xfsstats.xs_kobj); free_percpu(xfsstats.xs_stats); kset_unregister(xfs_kset); + debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_caches(); xfs_uuid_table_free(); - xfs_cpu_hotplug_destroy(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 364e2c2648a8..302e6e5d6c7e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -100,4 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) +struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent); + #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f3cc204bb4bf..3926cf7f2a6e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -22,6 +22,9 @@ * daddr: physical block number in 512b blocks * bbcount: number of blocks in a physical extent, in 512b blocks * + * rtx: physical rt extent number for extent mappings + * rtxcount: number of rt extents in an extent mapping + * * owner: reverse-mapping owner, usually inodes * * fileoff: file offset, in fs blocks @@ -802,36 +805,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(PE_SIZE_PTE); -TRACE_DEFINE_ENUM(PE_SIZE_PMD); -TRACE_DEFINE_ENUM(PE_SIZE_PUD); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, - bool write_fault), - TP_ARGS(ip, pe_size, write_fault), + TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), + TP_ARGS(ip, order, write_fault), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(enum page_entry_size, pe_size) + __field(unsigned int, order) __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->pe_size = pe_size; + __entry->order = order; __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __print_symbolic(__entry->pe_size, - { PE_SIZE_PTE, "PTE" }, - { PE_SIZE_PMD, "PMD" }, - { PE_SIZE_PUD, "PUD" }), + __entry->order, __entry->write_fault) ) @@ -3829,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->new_ptr) ); +TRACE_EVENT(xfs_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xfs_inode_reload_unlinked_bucket, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS) +); + DECLARE_EVENT_CLASS(xfs_ag_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 43e5c219aaed..a3975f325f4e 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -46,6 +46,17 @@ xfs_attr_grab_log_assist( if (xfs_sb_version_haslogxattrs(&mp->m_sb)) return 0; + /* + * Check if the filesystem featureset is new enough to set this log + * incompat feature bit. Strictly speaking, the minimum requirement is + * a V5 filesystem for the superblock field, but we'll require rmap + * or reflink to avoid having to deal with really old kernels. + */ + if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) { + error = -EOPNOTSUPP; + goto drop_incompat; + } + /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 789cfb74c146..b2c9b35df8f7 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = { .read_folio = zonefs_read_folio, .readahead = zonefs_readahead, .writepages = zonefs_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 9350221abfc5..9d1a9808fbbb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -658,7 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode->i_ino = ino; inode->i_mode = z->z_mode; - inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, + inode_get_ctime(dir)); inode->i_uid = z->z_uid; inode->i_gid = z->z_gid; inode->i_size = z->z_wpoffset; @@ -694,7 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode->i_ino = ino; inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; - inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; + inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, + inode_get_ctime(root)); inode->i_private = &sbi->s_zgroup[ztype]; set_nlink(inode, 2); @@ -1317,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; - inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &zonefs_dir_operations; inode->i_size = 2; |